In [None]:
# =============================================================================
# 智能文档摘要系统 - 基于Map-Reduce架构的并行化摘要生成
# =============================================================================
# 核心思想：将大文档拆分成小块并行处理，然后逐步合并成最终摘要
# 优势：处理超长文档，避免单次调用token限制，提高处理效率

import operator
from typing import Annotated, List, Literal, TypedDict

from langchain.chains.combine_documents.reduce import (
    acollapse_docs,      # 异步合并文档摘要
    split_list_of_docs,  # 按token数量智能分割文档列表
)
from langchain_core.documents import Document
from langgraph.constants import Send
from langgraph.graph import END, START, StateGraph

# Global setting: token budget used both when splitting summaries into groups
# and when deciding whether another collapse pass is required.
token_max = 1_000


def length_function(documents: List[Document]) -> int:
    """Return the total token count of ``documents``.

    Each document's ``page_content`` is measured with ``llm.get_num_tokens``
    and the per-document counts are added up. An empty list yields ``0``.

    NOTE(review): ``llm`` is not defined in this part of the file; it is
    expected to exist at module level before this function is called.
    """
    total_tokens = 0
    for document in documents:
        total_tokens += llm.get_num_tokens(document.page_content)
    return total_tokens


# =============================================================================
# 状态定义：整个摘要流程的数据结构
# =============================================================================
class OverallState(TypedDict):
    """State of the main graph: the data carried through the whole summarization flow."""
    # Original document contents; map_summaries emits one task per entry.
    contents: List[str]
    # Summaries produced by the generate_summary tasks. The operator.add
    # annotation is there so that the lists returned by the individual tasks
    # are concatenated into one list (per the original author's note).
    summaries: Annotated[list, operator.add]
    # Summaries wrapped as Documents; overwritten by each collapse pass.
    collapsed_summaries: List[Document]
    # The final summary text written by generate_final_summary.
    final_summary: str


class SummaryState(TypedDict):
    """State handed to a single generate_summary task."""
    # Text of the one content item to summarize.
    content: str


# =============================================================================
# 核心处理函数：Map阶段 - 并行生成摘要
# =============================================================================
async def generate_summary(state: SummaryState):
    """Summarize one content item (the map step).

    Renders ``map_prompt`` with ``state["content"]``, awaits the LLM on the
    rendered prompt, and returns the response text wrapped in a one-element
    list under the ``"summaries"`` key.

    NOTE(review): ``map_prompt`` and ``llm`` are not defined in this part of
    the file; they are expected to exist at module level.
    """
    rendered_prompt = map_prompt.invoke(state["content"])
    llm_reply = await llm.ainvoke(rendered_prompt)
    summary_text = llm_reply.content
    # A list (not a bare string) is returned so it can be merged into the
    # list-typed "summaries" field of the overall state.
    return {"summaries": [summary_text]}


def map_summaries(state: OverallState):
    """Fan out the work: build one ``Send`` per entry of ``state["contents"]``.

    Every ``Send`` targets the ``"generate_summary"`` node and carries a
    payload of the form ``{"content": <entry>}``. The list is returned in the
    same order as the input contents; empty contents give an empty list.
    """
    dispatches = []
    for item in state["contents"]:
        payload = {"content": item}
        dispatches.append(Send("generate_summary", payload))
    return dispatches


def collect_summaries(state: OverallState):
    """Wrap every entry of ``state["summaries"]`` in a ``Document``.

    Returns a state update whose ``"collapsed_summaries"`` key holds the
    wrapped summaries, in their original order.
    """
    wrapped = []
    for summary_text in state["summaries"]:
        wrapped.append(Document(summary_text))
    return {"collapsed_summaries": wrapped}


# =============================================================================
# 核心处理函数：Reduce阶段 - 合并摘要
# =============================================================================
async def _reduce(input: List[Document]) -> str:
    """Merge several summaries into a single one by prompting the LLM.

    Args:
        input: The summaries to merge. ``generate_final_summary`` passes
            ``state["collapsed_summaries"]`` (a list of ``Document``) here,
            and ``collapse_summaries`` hands this function to
            ``acollapse_docs`` together with a list of documents, so the
            parameter is annotated as a document list rather than a dict.
            The name shadows the ``input`` builtin; it is kept so the
            signature stays unchanged for existing callers.

    Returns:
        The ``content`` attribute of the LLM response.

    NOTE(review): ``reduce_prompt`` and ``llm`` are not defined in this part
    of the file. This presumably relies on ``reduce_prompt`` accepting the
    value as its only input variable; confirm against its definition.
    """
    prompt = reduce_prompt.invoke(input)  # render the merge prompt
    response = await llm.ainvoke(prompt)  # asynchronous LLM call
    return response.content


async def collapse_summaries(state: OverallState):
    """Run one collapse pass over ``state["collapsed_summaries"]``.

    The current summaries are partitioned with ``split_list_of_docs`` using
    ``length_function`` and the ``token_max`` budget. Each group is then
    collapsed with ``acollapse_docs`` (using ``_reduce`` as the combine
    function), one group after another. The results replace
    ``"collapsed_summaries"`` in the returned state update.
    """
    groups = split_list_of_docs(
        state["collapsed_summaries"], length_function, token_max
    )
    # The awaits inside the comprehension run sequentially, group by group.
    collapsed = [await acollapse_docs(group, _reduce) for group in groups]
    return {"collapsed_summaries": collapsed}


# =============================================================================
# 流程控制：决定是否需要继续合并
# =============================================================================
def should_collapse(
    state: OverallState,
) -> Literal["collapse_summaries", "generate_final_summary"]:
    """Choose the next node based on the size of the collapsed summaries.

    Returns ``"collapse_summaries"`` while the token count of
    ``state["collapsed_summaries"]`` (measured by ``length_function``) is
    strictly greater than ``token_max``; otherwise returns
    ``"generate_final_summary"``.
    """
    fits_budget = length_function(state["collapsed_summaries"]) <= token_max
    return "generate_final_summary" if fits_budget else "collapse_summaries"


async def generate_final_summary(state: OverallState):
    """Produce the final summary from the collapsed summaries.

    Passes ``state["collapsed_summaries"]`` to ``_reduce`` and stores the
    awaited result under the ``"final_summary"`` key of the returned update.
    """
    remaining_summaries = state["collapsed_summaries"]
    final_text = await _reduce(remaining_summaries)
    return {"final_summary": final_text}


# =============================================================================
# 图结构构建：定义整个摘要流程的执行逻辑
# =============================================================================
# Create the state graph over OverallState.
graph = StateGraph(OverallState)

# Nodes: the processing steps of the pipeline.
graph.add_node("generate_summary", generate_summary)        # map step: summarize one content item
graph.add_node("collect_summaries", collect_summaries)     # wrap the summaries as Documents
graph.add_node("collapse_summaries", collapse_summaries)   # merge summaries group by group
graph.add_node("generate_final_summary", generate_final_summary)  # produce the final summary

# Edges: the execution flow.
graph.add_conditional_edges(START, map_summaries, ["generate_summary"])  # START -> one Send per content item
graph.add_edge("generate_summary", "collect_summaries")                  # map -> collect
graph.add_conditional_edges("collect_summaries", should_collapse)        # collect -> collapse or finish
graph.add_conditional_edges("collapse_summaries", should_collapse)       # collapse -> collapse again or finish
graph.add_edge("generate_final_summary", END)                            # final summary -> END

# Compile the graph into the runnable summarization app.
app = graph.compile()