# MCP From Scratch

- LangGraph のドキュメントをもとに回答する RAG を MCP ツールとして登録する
- https://mirror-feeling-d80.notion.site/MCP-From-Scratch-1b9808527b178040b5baf83a991ed3b2


In [1]:
from dotenv import load_dotenv, find_dotenv

# Locate a .env file with find_dotenv() and load its entries into the process
# environment. The value returned by load_dotenv() is what the cell displays.
# NOTE(review): presumably this supplies the API keys for the OpenAI and
# Anthropic clients created in later cells — confirm against the .env file.
load_dotenv(find_dotenv())

True

## RAG を行う LangChain のツールを作成


In [3]:
import re
import os
import tiktoken

from bs4 import BeautifulSoup

from langchain_community.document_loaders import RecursiveUrlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import SKLearnVectorStore

In [4]:
def count_tokens(text, model="cl100k_base"):
    """
    Return how many tokens ``text`` encodes to under a tiktoken encoding.

    Args:
        text (str): The text to tokenize
        model (str): Name passed to ``tiktoken.get_encoding``; despite the
            parameter name this is a tiktoken *encoding* name, not a chat-model
            name (default: cl100k_base)

    Returns:
        int: Length of the encoded token sequence
    """
    token_ids = tiktoken.get_encoding(model).encode(text)
    return len(token_ids)

In [5]:
def bs4_extractor(html: str) -> str:
    """
    Turn an HTML page into plain text, preferring the main article body.

    Args:
        html (str): Raw HTML of a single page

    Returns:
        str: Text of the ``<article class="md-content__inner">`` element when
        present, otherwise the text of the whole page, with runs of blank
        lines collapsed and surrounding whitespace stripped
    """
    parsed = BeautifulSoup(html, "lxml")

    # The documentation pages keep their main content in this article element;
    # everything outside it is navigation and page chrome.
    article = parsed.find("article", class_="md-content__inner")
    if article:
        raw_text = article.get_text()
    else:
        # No article element: fall back to the text of the entire document
        raw_text = parsed.text

    # Squash any run of two or more newlines down to a single blank line
    collapsed = re.sub(r"\n\n+", "\n\n", raw_text)
    return collapsed.strip()

In [6]:
def load_langgraph_docs():
    """
    Load LangGraph documentation from the official website.

    This function:
    1. Crawls each root URL below with RecursiveUrlLoader (max_depth=5),
       extracting page text with bs4_extractor
    2. Prints the number of documents loaded and the source URL of each
    3. Counts the tokens of every document once and prints the total

    Returns:
        tuple: ``(docs, tokens_per_doc)`` where ``docs`` is the list of loaded
        Document objects and ``tokens_per_doc`` is a list holding the token
        count of each document, in the same order
    """
    print("Loading LangGraph documentation...")

    # Root pages to crawl
    urls = [
        "https://langchain-ai.github.io/langgraph/concepts/",
        "https://langchain-ai.github.io/langgraph/how-tos/",
        "https://langchain-ai.github.io/langgraph/tutorials/workflows/",
        "https://langchain-ai.github.io/langgraph/tutorials/introduction/",
        "https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/",
    ]

    docs = []
    for url in urls:
        loader = RecursiveUrlLoader(
            url,
            max_depth=5,
            extractor=bs4_extractor,
        )

        # Drain the loader's lazy iterator straight into the accumulated list
        docs.extend(loader.lazy_load())

    print(f"Loaded {len(docs)} documents from LangGraph documentation.")
    print("\nLoaded URLs:")
    for position, doc in enumerate(docs, start=1):
        print(f"{position}. {doc.metadata.get('source', 'Unknown URL')}")

    # Tokenize each document exactly once; the total is derived from the
    # per-document counts instead of encoding every page a second time.
    tokens_per_doc = [count_tokens(doc.page_content) for doc in docs]
    total_tokens = sum(tokens_per_doc)
    print(f"Total tokens in loaded documents: {total_tokens}")

    return docs, tokens_per_doc

In [7]:
def save_llms_full(documents, output_filename="llms_full.txt"):
    """
    Concatenate the documents into a single text file.

    Each document is written as a numbered section with its source URL and
    content, followed by a separator line of 80 ``=`` characters.

    Args:
        documents (list): Objects exposing a ``metadata`` dict and a
            ``page_content`` string
        output_filename (str): Path of the file to (over)write
            (default: llms_full.txt in the current working directory)
    """
    separator = "\n\n" + "=" * 80 + "\n\n"

    # Pin UTF-8 so non-ASCII characters in the scraped pages are written the
    # same way on every platform, regardless of the locale's default encoding.
    with open(output_filename, "w", encoding="utf-8") as f:
        for number, doc in enumerate(documents, start=1):
            # Fall back to a placeholder when the loader recorded no URL
            source = doc.metadata.get("source", "Unknown URL")

            header = f"DOCUMENT {number}\nSOURCE: {source}\nCONTENT:\n"
            f.write(header + doc.page_content + separator)

    print(f"Documents concatenated into {output_filename}")

In [8]:
def split_documents(documents):
    """
    Split documents into smaller chunks for improved retrieval.

    This function:
    1. Splits with RecursiveCharacterTextSplitter, measuring chunk length in
       tiktoken tokens (chunk_size=8000, chunk_overlap=500)
    2. Prints the number of chunks produced
    3. Prints the total token count across all chunks

    Args:
        documents (list): List of Document objects to split

    Returns:
        list: A list of split Document objects
    """
    print("Splitting documents...")

    # Pin the encoding explicitly: when encoding_name is omitted the splitter
    # falls back to its own default encoding, which differs from the
    # cl100k_base encoding count_tokens uses. Using the same encoding keeps
    # chunk sizing consistent with the token totals printed here.
    # chunk_size=8000 gives large chunks; chunk_overlap=500 repeats the tail of
    # one chunk at the head of the next.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=8000, chunk_overlap=500
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Created {len(split_docs)} chunks from documents.")

    # Total is larger than the input total because overlapping text is
    # counted in both neighbouring chunks.
    total_tokens = sum(count_tokens(doc.page_content) for doc in split_docs)
    print(f"Total tokens in split documents: {total_tokens}")

    return split_docs

In [9]:
def create_vectorstore(splits):
    """
    Create a fresh vector store from document chunks using SKLearnVectorStore.

    This function:
    1. Initializes the OpenAI ``text-embedding-3-large`` embedding model
    2. Removes any previously persisted store at the target path
    3. Embeds the chunks into a new store and persists it as parquet in the
       current working directory

    Args:
        splits (list): List of split Document objects to embed

    Returns:
        SKLearnVectorStore: A vector store containing the embedded documents
    """
    print("Creating SKLearnVectorStore...")

    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

    persist_path = os.path.join(os.getcwd(), "sklearn_vectorstore.parquet")

    # SKLearnVectorStore loads an existing file at persist_path when it is
    # constructed, so building on top of a previous run would append the same
    # chunks again and retrieval would return duplicate hits. Start clean.
    # Trade-off: if embedding fails below, the previous store is already gone.
    try:
        os.remove(persist_path)
    except FileNotFoundError:
        pass  # First run: nothing to clear

    vectorstore = SKLearnVectorStore.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_path=persist_path,
        serializer="parquet",
    )
    print("SKLearnVectorStore created successfully.")

    vectorstore.persist()
    print("SKLearnVectorStore was persisted to", persist_path)

    return vectorstore

In [10]:
# Crawl the LangGraph docs; tokens_per_doc holds one token count per loaded document
documents, tokens_per_doc = load_langgraph_docs()

# Dump every loaded document into a single text file for inspection
save_llms_full(documents)

# Break the documents into token-bounded chunks for embedding
split_docs = split_documents(documents)

# Embed the chunks and persist the resulting vector store to disk;
# `vectorstore` is reused by the retrieval check in a later cell
vectorstore = create_vectorstore(split_docs)

Loading LangGraph documentation...
Loaded 128 documents from LangGraph documentation.

Loaded URLs:
1. https://langchain-ai.github.io/langgraph/concepts/
2. https://langchain-ai.github.io/langgraph/concepts/persistence/
3. https://langchain-ai.github.io/langgraph/concepts/high_level/
4. https://langchain-ai.github.io/langgraph/concepts/time-travel/
5. https://langchain-ai.github.io/langgraph/concepts/functional_api/
6. https://langchain-ai.github.io/langgraph/concepts/v0-human-in-the-loop/
7. https://langchain-ai.github.io/langgraph/concepts/memory/
8. https://langchain-ai.github.io/langgraph/concepts/durable_execution/
9. https://langchain-ai.github.io/langgraph/concepts/streaming/
10. https://langchain-ai.github.io/langgraph/concepts/low_level/
11. https://langchain-ai.github.io/langgraph/concepts/agentic_concepts/
12. https://langchain-ai.github.io/langgraph/concepts/multi_agent/
13. https://langchain-ai.github.io/langgraph/concepts/breakpoints/
14. https://langchain-ai.github.io/la

In [12]:
# Smoke-test retrieval against the in-memory store: ask for the top 3 results
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Sample question (Japanese for "What is LangGraph?")
query = "LangGraphとはなんですか？"
relevant_docs = retriever.invoke(query)
print(f"Retrieved {len(relevant_docs)} relevant documents")

# Show each hit's source URL and the first 200 characters of its text
for d in relevant_docs:
    print(d.metadata["source"])
    print(d.page_content[0:200])
    print("\n--------------------------------\n")

Retrieved 3 relevant documents
https://langchain-ai.github.io/langgraph/concepts/high_level/
Why LangGraph?¶
LLM applications¶
LLMs make it possible to embed intelligence into a new class of applications. There are many patterns for building applications that use LLMs. Workflows have scaffold

--------------------------------

https://langchain-ai.github.io/langgraph/concepts/high_level/
Why LangGraph?¶
LLM applications¶
LLMs make it possible to embed intelligence into a new class of applications. There are many patterns for building applications that use LLMs. Workflows have scaffold

--------------------------------

https://langchain-ai.github.io/langgraph/concepts/langgraph_platform/
LangGraph Platform¶
Watch this 4-minute overview of LangGraph Platform to see how it helps you build, deploy, and evaluate agentic applications.

Overview¶
LangGraph Platform is a commercial solution 

--------------------------------



In [13]:
from langchain_core.tools import tool


@tool
def langgraph_query_tool(query: str):
    """
    Query the LangGraph documentation using a retriever.

    Args:
        query (str): The query to search the documentation with

    Returns:
        str: A str of the retrieved documents
    """
    # NOTE: the docstring above is deliberately left untouched. The @tool
    # decorator uses it as the tool description shown to the model, so
    # rewording it would change what the model sees.

    # Re-open the persisted parquet store from the working directory on every
    # call instead of relying on the notebook's in-memory `vectorstore`.
    store = SKLearnVectorStore(
        embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
        persist_path=os.getcwd() + "/sklearn_vectorstore.parquet",
        serializer="parquet",
    )
    hits = store.as_retriever(search_kwargs={"k": 3}).invoke(query)
    print(f"Retrieved {len(hits)} relevant documents")

    # One numbered section per hit, separated by blank lines
    sections = []
    for number, hit in enumerate(hits, start=1):
        sections.append(f"==DOCUMENT {number}==\n{hit.page_content}")
    return "\n\n".join(sections)

In [14]:
from langchain_anthropic import ChatAnthropic


# Chat model with deterministic sampling (temperature=0)
llm = ChatAnthropic(model="claude-3-7-sonnet-latest", temperature=0)
# Expose the documentation retriever to the model as a callable tool
augmented_llm = llm.bind_tools([langgraph_query_tool])

# System prompt. Written as explicit pieces so the trailing quotation mark of
# "I don't know." survives: in the earlier triple-quoted form the closing
# delimiter consumed it, leaving an unbalanced quote in the prompt.
instructions = (
    "You are a helpful assistant that can answer questions about the "
    "LangGraph documentation. \n"
    "Use the langgraph_query_tool for any questions about the documentation.\n"
    "If you don't know the answer, say \"I don't know.\""
)

messages = [
    {"role": "system", "content": instructions},
    {"role": "user", "content": "LangGraphの特徴はなんですか？"},
]

# A single model turn: the reply may be a tool-call request rather than a
# final answer, and this cell does not execute the requested tool.
message = augmented_llm.invoke(messages)
message.pretty_print()


[{'text': 'LangGraphの特徴について調べてみますね。LangGraphのドキュメントから情報を取得します。', 'type': 'text'}, {'id': 'toolu_01E6UT2vxs9XRM1kszyuMY6F', 'input': {'query': 'LangGraph features characteristics'}, 'name': 'langgraph_query_tool', 'type': 'tool_use'}]
Tool Calls:
  langgraph_query_tool (toolu_01E6UT2vxs9XRM1kszyuMY6F)
 Call ID: toolu_01E6UT2vxs9XRM1kszyuMY6F
  Args:
    query: LangGraph features characteristics
