In [None]:
from dotenv import load_dotenv

# Load environment variables from the project's .env file before any model
# client is created (presumably this supplies the DeepSeek API key — confirm).
load_dotenv("../config/.env")

In [None]:
from langchain.chat_models import init_chat_model

# Chat model shared by the summarisation chain and by every graph node below.
llm = init_chat_model("deepseek-chat", model_provider="deepseek")

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Load the blog post to summarise; `docs` is reused by both summarisation
# approaches further down (single call and map-reduce).
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
docs = loader.load()

### 通过一次大模型调用总结

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# "Stuff" approach: all documents are inserted into one prompt through the
# {context} placeholder and summarised with a single model call.
prompt = ChatPromptTemplate.from_messages(
    # Use real newline escapes: a doubled backslash would place the literal
    # characters backslash + "n" in the prompt instead of a blank line.
    [("system", "请对以下内容写一个简明的总结：\n\n{context}")]
)
chain = create_stuff_documents_chain(llm, prompt)

# Stream the answer so each printed piece is a model chunk. Iterating over the
# string returned by invoke() would print the summary one character at a time.
for token in chain.stream({"context": docs}):
    print(token, end="|")

### 通过Map-Reduce架构并行化总结

In [None]:
from langchain_text_splitters import CharacterTextSplitter

# Split the loaded documents into chunks sized with a tiktoken encoder
# (target of 1000 tokens per chunk, no overlap between chunks). The chunk
# texts become the input of the map-reduce graph.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
split_docs = text_splitter.split_documents(docs)

In [None]:
import operator
from typing import List, Annotated, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langgraph.graph import StateGraph, START, END
from langgraph.constants import Send
from langchain.chains.combine_documents.reduce import (
    acollapse_docs,
    split_list_of_docs,
)

# Maximum number of tokens handled in a single pass: compared against the
# summaries' token count in should_collapse and passed to split_list_of_docs
# in collapse_summaries.
token_max = 1000

# Map-stage prompt: asks for a concise summary of one chunk ({context}).
# The factory is `from_messages` (plural); `from_message` is not an attribute
# of ChatPromptTemplate and raises AttributeError.
map_prompt = ChatPromptTemplate.from_messages(
    [("system", "写下以下内容的简明摘要：{context}")]
)

# Reduce-stage prompt: receives a group of summaries through {docs} and asks
# the model to distil them into one consolidated summary.
reduce_template = """
以下是一组摘要：
{docs}
请将这些内容提炼成一个最终的、综合的摘要，概括主要主题。
"""
reduce_prompt = ChatPromptTemplate.from_messages([("human", reduce_template)])

# Main graph state: carries the data of the whole summarisation flow.
class OverallState(TypedDict):
    """State shared by the nodes of the main summarisation graph."""

    # Text of every chunk to summarise (the graph input).
    contents: List[str]
    # Per-chunk summaries. The operator.add annotation acts as the reducer that
    # merges the updates coming back from the map nodes.
    summaries: Annotated[list, operator.add]
    # Summaries wrapped as Documents; written by collect_summaries and
    # collapse_summaries.
    collapsed_summaries: List[Document]
    # Final combined summary, written by generate_final_summary.
    final_summary: str

class SummaryState(TypedDict):
    """State handed to a single generate_summary call: one chunk of text."""

    content: str

def length_function(documents: List[Document]) -> int:
    """Return the total number of tokens contained in ``documents``.

    Each document's ``page_content`` is measured with the chat model's
    ``get_num_tokens`` method and the counts are summed. An empty list
    yields 0.

    Args:
        documents: Documents whose text should be measured.

    Returns:
        The sum of the per-document token counts.
    """
    # The LangChain method is `get_num_tokens` (plural); `get_num_token` does
    # not exist on the model and raises AttributeError.
    return sum(llm.get_num_tokens(doc.page_content) for doc in documents)

def map_summaries(state: OverallState):
    """Fan out the map stage: build one ``Send`` per chunk of text.

    Used as the conditional edge leaving START. Every ``Send`` names the node
    to run ("generate_summary") and carries the private state for that run,
    so the chunks can be summarised in parallel.

    Args:
        state: Main graph state; only ``state["contents"]`` is read.

    Returns:
        A list of ``Send`` objects, one per entry in ``state["contents"]``.
    """
    # Send takes the target node name first and the node's input state second;
    # passing only the state dict leaves the target node unspecified.
    return [
        Send("generate_summary", {"content": content})
        for content in state["contents"]
    ]

async def _reduce(input: dict) -> str:
    """Merge several summaries into one by calling the model once.

    ``input`` is passed straight to ``reduce_prompt.invoke``.
    NOTE(review): callers in this file pass a list of Documents rather than a
    dict, relying on the prompt accepting a bare value for its single
    variable — confirm against the ChatPromptTemplate documentation.

    Returns:
        The ``content`` of the model's reply.
    """
    reply = await llm.ainvoke(reduce_prompt.invoke(input))
    return reply.content

def should_collapse(state: OverallState):
    """Decide whether the summaries need another collapse round.

    Used as the routing function after "collect_summaries" and
    "collapse_summaries".

    Args:
        state: Main graph state; only ``state["collapsed_summaries"]`` is read.

    Returns:
        "collapse_summaries" when the summaries' total token count exceeds
        ``token_max``, otherwise "generate_final_summary".
    """
    # Measure with length_function: calling should_collapse itself here would
    # recurse on a list instead of counting tokens. The state key and the
    # returned node name must match the declared names exactly.
    num_tokens = length_function(state["collapsed_summaries"])

    if num_tokens > token_max:
        return "collapse_summaries"
    return "generate_final_summary"

async def generate_summary(state: SummaryState):
    """Summarise a single chunk of text (the core of the map stage).

    Args:
        state: Per-chunk state; ``state["content"]`` is the text to summarise.

    Returns:
        A state update ``{"summaries": [<summary text>]}``.
    """
    prompt = map_prompt.invoke(state["content"])
    response = await llm.ainvoke(prompt)
    # `summaries` is reduced with operator.add on a list, so the update has to
    # be a list as well: adding a bare string to a list raises TypeError.
    return {"summaries": [response.content]}

def collect_summaries(state: OverallState):
    """Gather the map-stage summaries into a list of Documents.

    Each entry of ``state["summaries"]`` is wrapped in a ``Document`` and the
    resulting list is returned as the ``collapsed_summaries`` update.
    """
    wrapped = []
    for text in state["summaries"]:
        wrapped.append(Document(text))
    return {"collapsed_summaries": wrapped}

async def collapse_summaries(state: OverallState):
    """Merge the current summaries in batches bounded by ``token_max``.

    The documents in ``state["collapsed_summaries"]`` are grouped with
    ``split_list_of_docs`` (using ``length_function`` and ``token_max``), and
    each group is collapsed with ``acollapse_docs`` and ``_reduce``.

    Returns:
        A state update whose ``collapsed_summaries`` holds one collapsed
        result per group.
    """
    batches = split_list_of_docs(state["collapsed_summaries"], length_function, token_max)
    # Groups are collapsed one after another; each await finishes before the
    # next group starts.
    merged = [await acollapse_docs(batch, _reduce) for batch in batches]
    return {"collapsed_summaries": merged}

async def generate_final_summary(state: OverallState):
    """Produce the final summary from the collapsed summaries.

    Passes ``state["collapsed_summaries"]`` to ``_reduce`` and stores the
    returned text under ``final_summary``.
    """
    return {"final_summary": await _reduce(state["collapsed_summaries"])}

# Assemble the map-reduce graph over OverallState.
graph = StateGraph(OverallState)
# Register the four processing nodes under the names used by the edges below.
graph.add_node("generate_summary", generate_summary)
graph.add_node("collect_summaries", collect_summaries)
graph.add_node("collapse_summaries", collapse_summaries)
graph.add_node("generate_final_summary", generate_final_summary)

# START fans out through map_summaries to "generate_summary".
graph.add_conditional_edges(START, map_summaries, ["generate_summary"])
graph.add_edge("generate_summary", "collect_summaries")
# After collecting, and again after every collapse round, should_collapse
# picks the next node by returning its name.
graph.add_conditional_edges("collect_summaries", should_collapse)
graph.add_conditional_edges("collapse_summaries", should_collapse)
graph.add_edge("generate_final_summary", END)

# Compile the graph definition into the runnable application.
app = graph.compile()

In [None]:
async for step in app.astream(
    {"contents": [doc.content for doc in split_docs]},
    {"recursion_limit": 10}
):
    print(list(step.keys()))