In [4]:
import asyncio
from contextlib import AsyncExitStack

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def test_arxiv(init_timeout: float = 5):
    """Smoke-test the arXiv MCP server over a stdio transport.

    Launches ``arxiv-mcp-server`` via ``uvx``, performs the MCP ``initialize``
    handshake, prints the names of the tools the server advertises, then calls
    the ``search_papers`` tool with a fixed query and prints the raw result.

    Args:
        init_timeout: Seconds to wait for the ``initialize`` response before
            giving up.

    Raises:
        RuntimeError: If the server does not answer ``initialize`` within
            ``init_timeout`` seconds.
    """
    # Use uvx so collaborators don't need a prior tool install
    server = StdioServerParameters(
        command="uvx",
        args=["arxiv-mcp-server"],
        env=None,  # add env if the server needs any
    )
    # NOTE(review): server stderr is not captured here. Per the MCP Python SDK
    # docs, redirection is done through stdio_client's `errlog` argument rather
    # than a field on StdioServerParameters -- confirm against the installed
    # SDK version before relying on it.

    async with AsyncExitStack() as stack:
        # Start transport (stdio pipes); the stack tears it down on exit
        read, write = await stack.enter_async_context(stdio_client(server))

        # Create a session on top of the transport and run the handshake
        session = await stack.enter_async_context(ClientSession(read, write))
        try:
            await asyncio.wait_for(session.initialize(), timeout=init_timeout)
        except asyncio.TimeoutError:
            # asyncio.TimeoutError is only an alias of the builtin TimeoutError
            # from Python 3.11 on; naming it explicitly also works on 3.10.
            raise RuntimeError("Timed out waiting for MCP initialize() response") from None

        # List tools to verify connectivity
        tools = await session.list_tools()
        print("Tools:", [t.name for t in tools.tools])

        # Call the tool
        res = await session.call_tool(
            "search_papers",
            {
                "query": "transformer architecture",
                "max_results": 5,
                "date_from": "2023-01-01",
                "categories": ["cs.AI", "cs.LG"],
            },
        )
        print("Result:", res)


# Top-level `await` is only valid inside the notebook/IPython kernel; in a
# plain Python script, wrap the call as asyncio.run(test_arxiv()) instead.
await test_arxiv()


Tools: ['search_papers', 'download_paper', 'list_papers', 'read_paper']
Result: meta=None content=[TextContent(type='text', text='{\n  "total_results": 5,\n  "papers": [\n    {\n      "id": "2506.19125v1",\n      "title": "Finding Clustering Algorithms in the Transformer Architecture",\n      "authors": [\n        "Kenneth L. Clarkson",\n        "Lior Horesh",\n        "Takuya Ito",\n        "Charlotte Park",\n        "Parikshit Ram"\n      ],\n      "abstract": "The invention of the transformer architecture has revolutionized Artificial\\nIntelligence (AI), yielding unprecedented success in areas such as natural\\nlanguage processing, computer vision, and multimodal reasoning. Despite these\\nadvances, it is unclear whether transformers are able to learn and implement\\nprecise algorithms. Here, we demonstrate that transformers can exactly\\nimplement a fundamental and widely used algorithm for $k$-means clustering:\\nLloyd\'s algorithm. First, we theoretically prove the existence of 

In [None]:
Tools: ['search_papers', 'download_paper', 'list_papers', 'read_paper']
Result: meta=None content=[TextContent(type='text', text='{\n  "total_results": 5,\n  "papers": [\n    {\n      "id": "2506.19125v1",\n      "title": "Finding Clustering Algorithms in the Transformer Architecture",\n      "authors": [\n        "Kenneth L. Clarkson",\n        "Lior Horesh",\n        "Takuya Ito",\n        "Charlotte Park",\n        "Parikshit Ram"\n      ],\n      "abstract": "The invention of the transformer architecture has revolutionized Artificial\\nIntelligence (AI), yielding unprecedented success in areas such as natural\\nlanguage processing, computer vision, and multimodal reasoning. Despite these\\nadvances, it is unclear whether transformers are able to learn and implement\\nprecise algorithms. Here, we demonstrate that transformers can exactly\\nimplement a fundamental and widely used algorithm for $k$-means clustering:\\nLloyd\'s algorithm. First, we theoretically prove the existence of such a\\ntransformer architecture, which we term the $k$-means transformer, that exactly\\nimplements Lloyd\'s algorithm for $k$-means clustering using the standard\\ningredients of modern transformers: attention and residual connections. Next,\\nwe numerically implement this transformer and demonstrate in experiments the\\nexact correspondence between our architecture and Lloyd\'s algorithm, providing\\na fully neural implementation of $k$-means clustering. Finally, we demonstrate\\nthat interpretable alterations (e.g., incorporating layer normalizations or\\nmultilayer perceptrons) to this architecture yields diverse and novel variants\\nof clustering algorithms, such as soft $k$-means, spherical $k$-means, trimmed\\n$k$-means, and more. 
Collectively, our findings demonstrate how transformer\\nmechanisms can precisely map onto algorithmic procedures, offering a clear and\\ninterpretable perspective on implementing precise algorithms in transformers.",\n      "categories": [\n        "cs.LG",\n        "cs.AI"\n      ],\n      "published": "2025-06-23T20:52:01+00:00",\n      "url": "http://arxiv.org/pdf/2506.19125v1",\n      "resource_uri": "arxiv://2506.19125v1"\n    },\n    {\n      "id": "2502.16533v2",\n      "title": "A Survey of Graph Transformers: Architectures, Theories and Applications",\n      "authors": [\n        "Chaohao Yuan",\n        "Kangfei Zhao",\n        "Ercan Engin Kuruoglu",\n        "Liang Wang",\n        "Tingyang Xu",\n        "Wenbing Huang",\n        "Deli Zhao",\n        "Hong Cheng",\n        "Yu Rong"\n      ],\n      "abstract": "Graph Transformers (GTs) have demonstrated a strong capability in modeling\\ngraph structures by addressing the intrinsic limitations of graph neural\\nnetworks (GNNs), such as over-smoothing and over-squashing. Recent studies have\\nproposed diverse architectures, enhanced explainability, and practical\\napplications for Graph Transformers. In light of these rapid developments, we\\nconduct a comprehensive review of Graph Transformers, covering aspects such as\\ntheir architectures, theoretical foundations, and applications within this\\nsurvey. We categorize the architecture of Graph Transformers according to their\\nstrategies for processing structural information, including graph tokenization,\\npositional encoding, structure-aware attention and model ensemble. Furthermore,\\nfrom the theoretical perspective, we examine the expressivity of Graph\\nTransformers in various discussed architectures and contrast them with other\\nadvanced graph learning algorithms to discover the connections. 
Furthermore, we\\nprovide a summary of the practical applications where Graph Transformers have\\nbeen utilized, such as molecule, protein, language, vision, traffic, brain and\\nmaterial data. At the end of this survey, we will discuss the current\\nchallenges and prospective directions in Graph Transformers for potential\\nfuture research.",\n      "categories": [\n        "cs.LG",\n        "cs.AI"\n      ],\n      "published": "2025-02-23T10:55:19+00:00",\n      "url": "http://arxiv.org/pdf/2502.16533v2",\n      "resource_uri": "arxiv://2502.16533v2"\n    },\n    {\n      "id": "2407.09093v2",\n      "title": "On Exact Bit-level Reversible Transformers Without Changing Architectures",\n      "authors": [\n        "Guoqiang Zhang",\n        "J. P. Lewis",\n        "W. B. Kleijn"\n      ],\n      "abstract": "Various reversible deep neural networks (DNN) models have been proposed to\\nreduce memory consumption in the training process. However, almost all existing\\nreversible DNNs either require special non-standard architectures or are\\nconstructed by modifying existing DNN architectures considerably to enable\\nreversibility. In this work we present the BDIA-transformer, which is an exact\\nbit-level reversible transformer that uses an unchanged standard architecture\\nfor inference. The basic idea is to first treat each transformer block as the\\nEuler integration approximation for solving an ordinary differential equation\\n(ODE) and then incorporate the technique of bidirectional integration\\napproximation (BDIA) into the neural architecture, together with activation\\nquantization to make it exactly bit-level reversible. In the training process,\\nwe let a hyper-parameter $\\\\gamma$ in BDIA-transformer randomly take one of the\\ntwo values $\\\\{0.5, -0.5\\\\}$ per training sample per transformer block for\\naveraging every two consecutive integration approximations. 
As a result,\\nBDIA-transformer can be viewed as training an ensemble of ODE solvers\\nparameterized by a set of binary random variables, which regularizes the model\\nand results in improved validation accuracy. Lightweight side information per\\ntransformer block is required to be stored in the forward process to account\\nfor binary quantization loss to enable exact bit-level reversibility. In the\\ninference procedure, the expectation $\\\\mathbb{E}(\\\\gamma)=0$ is taken to make\\nthe resulting architectures of BDIA-transformer identical to transformers up to\\nactivation quantization. Our experiments in both image classification and\\nlanguage translation show that BDIA-transformers outperform their conventional\\ncounterparts significantly in terms of validation performance while also\\nrequiring considerably less training memory.",\n      "categories": [\n        "cs.LG",\n        "cs.AI"\n      ],\n      "published": "2024-07-12T08:42:58+00:00",\n      "url": "http://arxiv.org/pdf/2407.09093v2",\n      "resource_uri": "arxiv://2407.09093v2"\n    },\n    {\n      "id": "2501.02007v1",\n      "title": "TART: Token-based Architecture Transformer for Neural Network Performance Prediction",\n      "authors": [\n        "Yannis Y. He"\n      ],\n      "abstract": "In the realm of neural architecture design, achieving high performance is\\nlargely reliant on the manual expertise of researchers. Despite the emergence\\nof Neural Architecture Search (NAS) as a promising technique for automating\\nthis process, current NAS methods still require human input to expand the\\nsearch space and cannot generate new architectures. This paper explores the\\npotential of Transformers in comprehending neural architectures and their\\nperformance, with the objective of establishing the foundation for utilizing\\nTransformers to generate novel networks. 
We propose the Token-based\\nArchitecture Transformer (TART), which predicts neural network performance\\nwithout the need to train candidate networks. TART attains state-of-the-art\\nperformance on the DeepNets-1M dataset for performance prediction tasks without\\nedge information, indicating the potential of Transformers to aid in\\ndiscovering novel and high-performing neural architectures.",\n      "categories": [\n        "cs.LG",\n        "cs.AI"\n      ],\n      "published": "2025-01-02T05:22:17+00:00",\n      "url": "http://arxiv.org/pdf/2501.02007v1",\n      "resource_uri": "arxiv://2501.02007v1"\n    },\n    {\n      "id": "2408.10189v2",\n      "title": "Transformers to SSMs: Distilling Quadratic Knowledge to Subquadratic Models",\n      "authors": [\n        "Aviv Bick",\n        "Kevin Y. Li",\n        "Eric P. Xing",\n        "J. Zico Kolter",\n        "Albert Gu"\n      ],\n      "abstract": "Transformer architectures have become a dominant paradigm for domains like\\nlanguage modeling but suffer in many inference settings due to their\\nquadratic-time self-attention. Recently proposed subquadratic architectures,\\nsuch as Mamba, have shown promise, but have been pretrained with substantially\\nless computational resources than the strongest Transformer models. In this\\nwork, we present a method that is able to distill a pretrained Transformer\\narchitecture into alternative architectures such as state space models (SSMs).\\nThe key idea to our approach is that we can view both Transformers and SSMs as\\napplying different forms of mixing matrices over the token sequences. We can\\nthus progressively distill the Transformer architecture by matching different\\ndegrees of granularity in the SSM: first matching the mixing matrices\\nthemselves, then the hidden units at each block, and finally the end-to-end\\npredictions. 
Our method, called MOHAWK, is able to distill a Mamba-2 variant\\nbased on the Phi-1.5 architecture (Phi-Mamba) using only 3B tokens and a hybrid\\nversion (Hybrid Phi-Mamba) using 5B tokens. Despite using less than 1% of the\\ntraining data typically used to train models from scratch, Phi-Mamba boasts\\nsubstantially stronger performance compared to all past open-source\\nnon-Transformer models. MOHAWK allows models like SSMs to leverage\\ncomputational resources invested in training Transformer-based architectures,\\nhighlighting a new avenue for building such models.",\n      "categories": [\n        "cs.LG",\n        "cs.AI"\n      ],\n      "published": "2024-08-19T17:48:11+00:00",\n      "url": "http://arxiv.org/pdf/2408.10189v2",\n      "resource_uri": "arxiv://2408.10189v2"\n    }\n  ]\n}', annotations=None, meta=None)] structuredContent=None isError=False