In [36]:
import asyncio
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

In [37]:
# Site root; hrefs scraped from pages are turned into absolute URLs against it.
BASE_URL = "https://mcpmarket.com"
# Leaderboards page that is scraped for the tool-card links.
LEADERBOARDS_URL = f"{BASE_URL}/leaderboards"

In [38]:
def create_tool_card_url(relative_url):
    """Resolve a tool card's href against ``BASE_URL``.

    Args:
        relative_url: The ``href`` taken from a tool-card anchor, e.g.
            ``"/server/trendradar"``.

    Returns:
        str: The absolute URL. Root-relative paths (leading ``/``) produce the
        same result as plain concatenation; hrefs without a leading slash and
        hrefs that are already absolute are resolved correctly instead of
        being glued onto the host name.
    """
    # urljoin implements standard relative-reference resolution, so the
    # function no longer depends on every href starting with "/".
    return urljoin(BASE_URL, relative_url)

In [39]:
async def scrape():
    """Collect the tool cards listed on the leaderboards page.

    Opens ``LEADERBOARDS_URL`` in headless Chromium, waits for an anchor whose
    ``id`` starts with ``tool-card-`` to show up, then parses the rendered
    HTML with BeautifulSoup.

    Returns:
        list[dict]: One entry per matching anchor, in document order, with
        ``"tool_id"`` (the anchor id minus the ``tool-card-`` prefix) and
        ``"href"`` (the anchor's href attribute as found, or ``None`` if the
        anchor has none).

    Raises:
        Whatever Playwright raises when navigation fails or the selector wait
        times out; the browser is closed before the error propagates.
    """
    card_selector = 'a[id^="tool-card-"]'

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(LEADERBOARDS_URL)
            # The cards are awaited explicitly before the HTML is captured.
            await page.wait_for_selector(card_selector)
            html = await page.content()
        finally:
            # Close the browser even when goto/wait_for_selector raises.
            await browser.close()

    soup = BeautifulSoup(html, "html.parser")

    # The selector only matches anchors that carry an id, so .get("id") is
    # always a string here.
    return [
        {
            "tool_id": a.get("id").removeprefix("tool-card-"),
            "href": a.get("href"),
        }
        for a in soup.select(card_selector)
    ]

In [40]:
# Top-level `await` is used directly in the cell, so this needs a kernel that
# supports it (e.g. IPython/Jupyter).
tool_cards = await scrape()
# Quick sanity check on how many cards were found.
print(len(tool_cards))

100


In [41]:
# Peek at the first scraped record to confirm its shape.
tool_cards[0]

{'tool_id': 'trendradar', 'href': '/server/trendradar'}

In [42]:
# Check that the first card's href resolves to a full URL.
create_tool_card_url(tool_cards[0]['href'])

'https://mcpmarket.com/server/trendradar'

In [43]:
async def fetch_html_after_optional_tools_click(
    url: str,
    *,
    click_text: str = "Tools",
    wait_state: str = "networkidle",
    timeout: int = 5000,
) -> str:
    """Load ``url`` in headless Chromium and return the rendered HTML.

    If the page contains an element matching the text ``click_text``, the
    first match is clicked and the page is allowed to settle again before the
    HTML is captured. If no element matches, the HTML is returned as loaded.

    Args:
        url: Page to open.
        click_text: Text of the button/tab to click when present.
        wait_state: Load state passed to ``page.wait_for_load_state`` after
            navigation and again after the click.
        timeout: Maximum time in milliseconds to wait for the matched element
            to become clickable. Navigation and load-state waits keep
            Playwright's default timeouts.

    Returns:
        str: The page's HTML after the optional click.

    Raises:
        Whatever Playwright raises on navigation failure or timeout; the
        browser is closed before the error propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            await page.goto(url)
            await page.wait_for_load_state(wait_state)

            # The click is optional: only attempt it when something matches.
            locator = page.locator(f"text={click_text}")
            if await locator.count() > 0:
                # `timeout` was previously accepted but ignored; it now bounds
                # how long the click waits for the element to be actionable.
                await locator.first.click(timeout=timeout)
                await page.wait_for_load_state(wait_state)

            return await page.content()
        finally:
            # Close the browser on both success and failure paths.
            await browser.close()

In [49]:
tool_html =await fetch_html_after_optional_tools_click(create_tool_card_url(tool_cards[0]['href']))

In [50]:
def parse_tools(html: str):
    """Extract tool definitions from a rendered server page.

    The page is scanned for ``div.pb-8.border-b`` sections. Each section that
    has an ``h3.text-lg.font-semibold`` heading yields one tool; sections
    without that heading are skipped.

    Args:
        html: Rendered HTML of a server detail page.

    Returns:
        list[dict]: One dict per tool with keys

        * ``"tool_name"``: heading text, stripped.
        * ``"description"``: text of the first
          ``p.text-sm.text-muted-foreground`` in the section, or ``None`` when
          absent. The text is returned as found on the page, whatever its
          language.
        * ``"parameters"``: list of ``{"name", "type", "optional"}`` dicts, one
          per parameter box that has both a ``code`` (name) and a
          ``span.font-mono`` (type) element. ``"optional"`` is ``True`` when
          any ``span`` inside the box contains the text ``Optional``.
    """
    soup = BeautifulSoup(html, "html.parser")

    tools = []

    # Each tool is rendered as a section closed off by a bottom border.
    for section in soup.select('div.pb-8.border-b'):
        # --- Tool name ---
        title_el = section.select_one('h3.text-lg.font-semibold')
        if title_el is None:
            continue

        # --- Description ---
        desc_el = section.select_one('p.text-sm.text-muted-foreground')
        description = desc_el.get_text(strip=True) if desc_el is not None else None

        # --- Parameters ---
        parameters = []
        for param in section.select('div.border.border-border\\/50'):
            name_el = param.select_one('code')
            type_el = param.select_one('span.font-mono')
            if name_el is None or type_el is None:
                continue

            # Plain text check instead of the deprecated `:contains()` CSS
            # pseudo-class; it matches on the span's own and nested text.
            is_optional = any(
                "Optional" in span.get_text() for span in param.select("span")
            )

            parameters.append({
                "name": name_el.get_text(strip=True),
                "type": type_el.get_text(strip=True),
                "optional": is_optional,
            })

        tools.append({
            "tool_name": title_el.get_text(strip=True),
            "description": description,
            "parameters": parameters,
        })

    return tools

In [51]:
# Parse the fetched page; the cell displays the full list of extracted tools.
parse_tools(tool_html)



[{'tool_name': 'Get Latest News',
  'description': "获取最新一批爬取的新闻数据，快速了解当前热点\n\nArgs:\n    platforms: 平台ID列表，如 ['zhihu', 'weibo', 'douyin']\n               - 不指定时：使用 config.yaml 中配置的所有平台\n               - 支持的平台来自 config/config.yaml 的 platforms 配置\n            ...Show more",
  'parameters': [{'name': 'limit', 'type': 'integer', 'optional': True},
   {'name': 'platforms', 'type': 'any', 'optional': True},
   {'name': 'include_url', 'type': 'boolean', 'optional': True}]},
 {'tool_name': 'Get Trending Topics',
  'description': '获取个人关注词的新闻出现频率统计（基于 config/frequency_words.txt）\n\n注意：本工具不是自动提取新闻热点，而是统计你在 config/frequency_words.txt 中\n设置的个人关注词在新闻中出现的频率。你可以自定义这个关注词列表。\n\nArgs:\n    top_n: 返回TOP N关注词，默认10\n    mode: 模式选择\n        - daily:...Show more',
  'parameters': [{'name': 'mode', 'type': 'string', 'optional': True},
   {'name': 'top_n', 'type': 'integer', 'optional': True}]},
 {'tool_name': 'Get News By Date',
  'description': '获取指定日期的新闻数据，用于历史数据分析和对比\n\nArgs:\n    date_query: 日期查询，可选格式:\n  