# 0 创建 数据库 客户端

In [1]:
from pymilvus import MilvusClient, DataType, Function, FunctionType

milvus_client = MilvusClient(uri="./milvus_demo.db")

  from pkg_resources import DistributionNotFound, get_distribution


# 1 加载文档 md

URI = './doc/md/LangChain V1.0'

In [25]:
path = './doc/md/LangChain V1.0/'

import glob
from langchain_text_splitters  import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

text_line = []

for file_path in glob.glob(path + "*.md"):
    with open(file_path, 'r') as file:
        text = file.read()
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(text)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,  # 每个块最大字符数
        chunk_overlap=200,  # 重叠字符数
        separators=["\n\n", "\n", " ", ""]  # 分割优先级
    )
    chunks = text_splitter.split_documents(md_header_splits)

    text_line = [chunk.page_content for chunk in chunks]

In [27]:
# len(glob.glob(path + "*.md"))
print(len(text_line),text_line[3])

20 Next, build a practical weather forecasting agent that demonstrates key production concepts:  
1. **Detailed system prompts** for better agent behavior
2. **Create tools** that integrate with external data
3. **Model configuration** for consistent responses
4. **Structured output** for predictable results
5. **Conversational memory** for chat-like interactions
6. **Create and run the agent** create a fully functional agent  
Let's walk through each step:  
<Steps>
<Step title="Define the system prompt">
The system prompt defines your agent’s role and behavior. Keep it specific and actionable:  
```python wrap theme={null}
SYSTEM_PROMPT = """You are an expert weather forecaster, who speaks in puns.

You have access to two tools:


# 2 准备嵌入模型 GLM ： embedding3

In [3]:
# pip install -qU zhipuai

import getpass
import os

if not os.getenv("ZHIPUAI_API_KEY"):
    os.environ["ZHIPUAI_API_KEY"] = getpass.getpass("Enter your ZhipuAI API key: ")

from langchain_community.embeddings import ZhipuAIEmbeddings

dim = 512
embeddings = ZhipuAIEmbeddings(
    model="embedding-3",
    # With the `embedding-3` class
    # of models, you can specify the size
    # of the embeddings you want returned.
    dimensions=dim,
)

In [4]:
def get_embedding(text):
    ''' return  embedding vector for a given text '''
    response = []
    for line in text:
        embedding_vectore = embeddings.embed_query(line)
        response.append(embedding_vectore)
    return response

# 3 将数据载入 Milvus

## 3.1 创建 Collections

### 3.1.1 设计 schema

In [5]:
schema = MilvusClient.create_schema()

# primery field
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True, auto_id=True)
# vector field
schema.add_field(field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=dim)
# text field
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=4000, enable_analyze=True)
# sparse vector field
schema.add_field(field_name="sparse", datatype=DataType.SPARSE_FLOAT_VECTOR)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 512}}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4000}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}], 'enable_dynamic_field': False}

### 3.1.2 定义一个将文本转换为稀疏向量表示的函数 

In [6]:
bm25_function = Function(
    name="text_bm25_emb", # Function name
    input_field_names=["text"], # Name of the VARCHAR field containing raw text data
    output_field_names=["sparse"], # Name of the SPARSE_FLOAT_VECTOR field reserved to store generated embeddings
    function_type=FunctionType.BM25, # Set to `BM25`
)

schema.add_function(bm25_function)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 512}}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4000}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>, 'is_function_output': True}], 'enable_dynamic_field': False, 'functions': [{'name': 'text_bm25_emb', 'description': '', 'type': <FunctionType.BM25: 1>, 'input_field_names': ['text'], 'output_field_names': ['sparse'], 'params': {}}]}

### 3.1.3 配置索引

In [9]:
index_params = milvus_client.prepare_index_params()

index_params.add_index(
    field_name="sparse",

    index_type="SPARSE_INVERTED_INDEX",
    metric_type="BM25",
    params={
        "inverted_index_algo": "DAAT_MAXSCORE",
        "bm25_k1": 1.2,
        "bm25_b": 0.75
    }
)

index_params.add_index(
    field_name="embedding",
    index_type="AUTOINDEX",
    metric_type="IP",
)

### 3.1.4 创建 collection

In [10]:
collection_name = "rag_langchain_v1"

# if the collection already exists, drop it
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

milvus_client.create_collection(
    collection_name=collection_name,
    schema=schema,
    consistency_level="Bounded", # 数据一致性 需要写笔记学习一下
    index_params=index_params,
)

In [11]:
indexes = milvus_client.list_indexes(collection_name)
print("Indexes before creation:", indexes)

Indexes before creation: ['embedding', 'sparse']


## 3.2 插入数据
文档 位于 ./doc/LangChain V1.0

In [28]:
print(len(text_line))

20


In [29]:
from tqdm import tqdm

data = []

embeddings_list = get_embedding(text_line)

for i, line in enumerate(tqdm(text_line, desc="Creating embeddings")):
    data.append({
        "id": i,
        "embedding": embeddings_list[i],
        "text": line,
    })
milvus_client.insert(collection_name=collection_name, data=data)

Creating embeddings: 100%|██████████| 20/20 [00:00<00:00, 37820.60it/s]


{'insert_count': 20, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'cost': 0}

# 4 构建 RAG

## 检索数据

In [30]:
question = " What's the new feature of LangChain V1.0?"

In [31]:
embedding = get_embedding([question])
print(type(embedding), len(embedding), type(embedding[0]), len(embedding[0]))  # 检查类型和前5个值

<class 'list'> 1 <class 'list'> 512


In [32]:
search_res = milvus_client.search(
    collection_name=collection_name,
    data=get_embedding([question]),  # Use the `get_embedding` function to convert the question to an embedding vector
    limit=3,  # Return top 3 results
    anns_field="embedding",
    search_params={"metric_type": "IP", "params": {}},  # Inner product distance
    output_fields=["text"],  # Return the text field
)

In [33]:
import json

retrieved_lines_with_distances = [
    (res["entity"]["text"], res["distance"]) for res in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))


[
    [
        "print(response['structured_response'])\n# ResponseFormat(\n#     punny_response=\"You're 'thund-erfully' welcome! It's always a 'breeze' to help you stay 'current' with the weather. I'm just 'cloud'-ing around waiting to 'shower' you with more forecasts whenever you need them. Have a 'sun-sational' day in the Florida sunshine!\",\n#     weather_conditions=None\n# )\n```\n</Expandable>  \n<Tip>\nTo learn how to trace your agent with LangSmith, see the [LangSmith documentation](/langsmith/trace-with-langchain).\n</Tip>  \nCongratulations! You now have an AI agent that can:  \n* **Understand context** and remember conversations\n* **Use multiple tools** intelligently\n* **Provide structured responses** in a consistent format\n* **Handle user-specific information** through context",
        0.20912817120552063
    ],
    [
        "from langchain.agents import create_agent\nfrom langchain.chat_models import init_chat_model\nfrom langchain.tools import tool, ToolRuntime\nfr