In [1]:
from llama_index.core import (VectorStoreIndex, SimpleDirectoryReader, load_index_from_storage
    , Document, Settings, StorageContext, PromptTemplate)
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.extractors import KeywordExtractor, SummaryExtractor
from llama_index.core.schema import MetadataMode
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.dashscope import DashScope

from llama_index.extractors.entity import EntityExtractor
from llama_index.readers.file import UnstructuredReader,PyMuPDFReader

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

import os, re, asyncio
from tqdm.asyncio import tqdm_asyncio
from tqdm import tqdm
import json

In [2]:
# Local directory holding the embedding model checkpoint.
embedding_model = "./Qwen3-Embedding-0.6B"
# Register the embedding model globally on llama_index Settings; it is loaded
# from local files only (no download attempt).
Settings.embed_model = HuggingFaceEmbedding(
    model_name=embedding_model,
    cache_folder=None,
    trust_remote_code=True,
    local_files_only=True
)

# Read hidden_size from the checkpoint config and report it as the embedding
# dimension (the printed label means "model embedding dimension").
config = AutoConfig.from_pretrained(embedding_model, trust_remote_code=True, local_files_only=True)
dimension = config.hidden_size
print(f"模型嵌入维度: {dimension}")

2025-10-09 15:45:06,059 - INFO - Load pretrained SentenceTransformer: ./Qwen3-Embedding-0.6B
2025-10-09 15:45:07,089 - INFO - 1 prompt is loaded, with the key: query


模型嵌入维度: 1024


In [3]:
# main_model = "./Qwen3-4B-Thinking-2507"
# Settings.llm = HuggingFaceLLM(
#     model_name=main_model,
#     tokenizer_name=main_model,
#     generate_kwargs={"temperature": 0.1, "top_p": 0.7},
#     device_map="cuda",
#     max_new_tokens=512
# )

# Read the DashScope credential from the environment so the secret never lives
# in the notebook. The key that was previously hard-coded here should be
# treated as leaked and revoked.
dashscope_api_key = os.environ.get("DASHSCOPE_API_KEY")
if not dashscope_api_key:
    raise RuntimeError(
        "DASHSCOPE_API_KEY is not set; export it before running this cell."
    )

# NOTE(review): generate_kwargs / max_new_tokens are the same arguments used in
# the commented-out HuggingFaceLLM configuration; confirm DashScope honors them.
Settings.llm = DashScope(
    api_key=dashscope_api_key,
    model="qwen3-max",
    generate_kwargs={"temperature": 0.1, "top_p": 0.7},
    max_new_tokens=512
)

In [4]:
# Location of the local Milvus Lite database file.
milvus_dir = "./milvus_test"
milvus_db_path = os.path.join(milvus_dir, "milvus_lite.db")
abs_db_path = os.path.abspath(milvus_db_path)
print(f"绝对数据库路径: {abs_db_path}")

# Create the directory before the store is opened. exist_ok guards against the
# directory appearing between the check and the call; the message now reports
# the directory that was actually created.
if not os.path.exists(milvus_dir):
    os.makedirs(milvus_dir, exist_ok=True)
    print(f"已创建 {milvus_dir} 目录")

# Use the dimension read from the embedding model config instead of a second
# hard-coded 1024, so the collection always matches the embedder.
# overwrite=False keeps whatever the collection already contains.
milvus_vector_store = MilvusVectorStore(
    uri=f"{abs_db_path}",
    collection_name="rag_collection",
    dim=dimension,
    overwrite=False
)
storage_context = StorageContext.from_defaults(vector_store=milvus_vector_store)

绝对数据库路径: /root/marathon/milvus_test/milvus_lite.db


  from pkg_resources import DistributionNotFound, get_distribution


In [5]:
def clean_text(text: str) -> str:
    """Normalize blank lines in ``text``.

    Every run of a newline, optional whitespace, and one or more further
    newlines is collapsed to exactly one empty line, and leading/trailing
    whitespace is removed from the result.
    """
    blank_run = re.compile(r'\n\s*\n+')
    collapsed = blank_run.sub('\n\n', text)
    return collapsed.strip()


In [6]:
async def generate_summary_async(text, max_words=20):
    """Asynchronously ask the configured LLM for a short summary of ``text``.

    Args:
        text: The text to summarize.
        max_words: Length limit interpolated into the prompt.

    Returns:
        The completion text with surrounding whitespace stripped.
    """
    completion = await Settings.llm.acomplete(
        f"总结以下文本，不超过{max_words}字，直接回复结果：{text}"
    )
    return completion.text.strip()

def generate_summary(text, max_words=20):
    """Synchronously ask the configured LLM for a short summary of ``text``.

    Args:
        text: The text to summarize.
        max_words: Length limit interpolated into the prompt.

    Returns:
        The completion text with surrounding whitespace stripped.
    """
    completion = Settings.llm.complete(
        f"总结以下文本，不超过{max_words}字，直接回复结果：{text}"
    )
    return completion.text.strip()

async def add_summaries_to_nodes_async(nodes_list):
    """Summarize every node concurrently and store each result on its node.

    The summaries are gathered in submission order, so the i-th summary always
    belongs to the i-th node. Collecting results in completion order and then
    zipping them with the nodes would attach summaries to the wrong nodes
    whenever requests finish out of order.

    Args:
        nodes_list: Nodes exposing ``.text`` and a mutable ``.metadata`` dict.
            Each node is updated in place with a ``"node_summary"`` entry.
    """
    tasks = [generate_summary_async(node.text) for node in nodes_list]

    # tqdm_asyncio.gather shows progress and returns results in input order.
    summaries = await tqdm_asyncio.gather(*tasks, desc="生成节点摘要进度")

    for node, summary in zip(nodes_list, summaries):
        node.metadata["node_summary"] = summary
        
def add_summaries_to_nodes(nodes_list):
    """Summarize each node sequentially and store the result on the node.

    Args:
        nodes_list: Nodes exposing ``.text`` and a mutable ``.metadata`` dict.
            Each node is updated in place with a ``"node_summary"`` entry.
    """
    progress = tqdm(nodes_list, desc="生成摘要")
    for current in progress:
        current.metadata["node_summary"] = generate_summary(current.text)


In [7]:
# Tokenizer used to measure chunk sizes. Load it from the same path as the
# embedding model (`embedding_model`) instead of a second hard-coded copy of
# the path, so the two cannot drift apart.
qwen_tokenizer = AutoTokenizer.from_pretrained(embedding_model, trust_remote_code=True)
documents_dir = "./docs"

# Per-extension readers: PDFs via PyMuPDF, .docx via Unstructured.
file_extractor = {
    ".pdf": PyMuPDFReader(), 
    ".docx": UnstructuredReader()
}
reader = SimpleDirectoryReader(input_dir=documents_dir, recursive=True, file_extractor=file_extractor)
documents = reader.load_data()

# Rebuild every document with normalized text; only the metadata is carried over.
cleaned_documents = [Document(text=clean_text(doc.text), metadata=doc.metadata) for doc in documents]
documents = cleaned_documents

# The value printed is the number of loaded Document objects.
print(f"文件大小:{len(documents)}")

# Splitter with 1024-token chunks and 100-token overlap, counted with qwen_tokenizer.
node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=100, tokenizer=qwen_tokenizer.tokenize)
#nodes = node_parser.get_nodes_from_documents(documents)
#print(f"节点数量:{len(nodes)}")

#asyncio.run(add_summaries_to_nodes_async(nodes))
#add_summaries_to_nodes(nodes)

文件大小:326


In [8]:
def save_summaries_to_json(nodes_list, file_path="nodes_summaries_temp.json"):
    """Write each node's summary to a JSON file keyed by list position.

    Args:
        nodes_list: Nodes exposing a ``.metadata`` dict.
        file_path: Destination JSON file (UTF-8, non-ASCII kept as-is).

    The file maps ``str(index)`` to the node's ``"node_summary"`` value, or to
    an empty string when the node has no summary.
    """
    payload = {
        str(position): item.metadata.get("node_summary", "")
        for position, item in enumerate(nodes_list)
    }

    with open(file_path, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=4)

    print(f"节点摘要已保存到 {file_path}")

def load_summaries_to_nodes(nodes_list, file_path="nodes_summaries.json"):
    """Attach summaries from a JSON file to nodes by list position.

    Args:
        nodes_list: Nodes exposing a mutable ``.metadata`` dict; updated in place.
        file_path: JSON file mapping a stringified integer index to a summary.

    Returns:
        The same ``nodes_list`` object, for chaining.

    Indices outside ``0 .. len(nodes_list) - 1`` are reported and skipped. This
    includes negative indices, which would otherwise be applied to a node
    counted from the end of the list.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        summaries_dict = json.load(f)

    node_count = len(nodes_list)
    # Process keys in numeric order so warnings come out in a stable order.
    for key in sorted(summaries_dict, key=int):
        idx = int(key)
        if 0 <= idx < node_count:
            nodes_list[idx].metadata["node_summary"] = summaries_dict[key]
        else:
            print(f"警告：索引 {idx} 超出节点列表长度，跳过。")

    return nodes_list


In [40]:
#save_summaries_to_json(nodes)

节点摘要已保存到 nodes_summaries.json


In [32]:
# keyword_extractor = KeywordExtractor(
#     llm=Settings.llm,  # 使用 LLM 提取关键词作为主题
#     keywords=5,  # 提取前 5 个关键词
#     prompt_template_str="""
#     从以下文本中提取 2 个主要关键词。
#     输出格式：仅用逗号分隔的关键词列表，不要添加任何解释或额外文本。
#     示例输出：关键词1,关键词2

#     文本：{text}
#     """
# )
# max_words = 20
# summary_extractor = SummaryExtractor(
#     llm=Settings.llm,
#     summaries=["self"],
#     prompt_template_str="""
#     总结以下文本，不超过20字，直接回复结果：\n{text}
#         """
# )


In [9]:
# index = VectorStoreIndex.from_documents(
#     documents,
#     storage_context=storage_context,
#     embed_model=Settings.embed_model,
#     node_parser=node_parser,
#     store_nodes_override=True
# )



# Build the index from the cleaned documents, chunking them with node_parser.
# store_nodes_override=True is passed so the nodes are kept in the docstore,
# which the next cell reads back.
# NOTE(review): storage_context wraps the Milvus store, so this presumably
# embeds and inserts every chunk into the collection — confirm.
transformations = [node_parser]
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    embed_model=Settings.embed_model,
    # node_parser=node_parser,
    transformations=transformations,
    store_nodes_override=True
)


In [10]:
# Pull the chunked nodes back out of the index's docstore.
nodes = list(index.docstore.docs.values())
# Sanity output: character length of the first document's text, then the node count.
print(len(documents[0].text))
print(f"节点数量: {len(nodes)}")


108
节点数量: 353


In [11]:
# Attach previously saved summaries (default file: nodes_summaries.json) to the
# nodes by position. `nodes` is mutated in place and also returned.
loaded_nodes = load_summaries_to_nodes(nodes)

In [12]:
# Rebuild the index from the nodes that now carry a "node_summary" entry.
# NOTE(review): this reuses the storage_context of the earlier
# from_documents() build, and the Milvus store was opened with overwrite=False
# — it looks like the same chunks may be inserted into the collection a second
# time; confirm.
index = VectorStoreIndex(
    loaded_nodes, 
    storage_context=storage_context, 
    embed_model=Settings.embed_model, 
    store_nodes_override=True)

In [13]:
# Nodes as stored in the rebuilt index's docstore; used by the retrievers below.
nodes_with_sums = list(index.docstore.docs.values())

In [None]:
# Print every node's summary. .get() keeps the loop running for nodes that the
# summaries file did not cover; such nodes have no "node_summary" entry and
# print as an empty line instead of raising KeyError.
for node in nodes_with_sums:
    print(node.metadata.get("node_summary", ""))

In [4]:
#####################################################
#以下是Reranker步骤

In [14]:
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.packs.fusion_retriever import HybridFusionRetrieverPack

In [15]:
# Local directory holding the reranker checkpoint.
reranker_model_path = "./Qwen3-Reranker-0.6B"

# Reranking postprocessor that keeps the 5 best nodes and runs on the GPU.
# NOTE(review): the recorded output of this cell warns that 'score.weight' was
# newly initialized rather than loaded from the checkpoint, which suggests the
# classification head is untrained and the scores may not be meaningful — confirm.
reranker = SentenceTransformerRerank(
    model=reranker_model_path,
    top_n=5,
    device="cuda",
    trust_remote_code=True
)

# Reach into the wrapper for the underlying tokenizer and model.
# `_model` is a private attribute, so this may break on a library upgrade.
cross_encoder = reranker._model
reranker_tokenizer = cross_encoder.tokenizer
reranker_model = cross_encoder.model

# Register a dedicated '[PAD]' special token.
special_tokens = {'pad_token': '[PAD]'}
num_added_tokens = reranker_tokenizer.add_special_tokens(special_tokens)

# Resize the embedding matrix to the tokenizer's current vocabulary size.
reranker_model.resize_token_embeddings(len(reranker_tokenizer))

# Point the tokenizer and the model config at the same pad token id.
reranker_tokenizer.pad_token = '[PAD]'
reranker_tokenizer.pad_token_id = reranker_tokenizer.convert_tokens_to_ids('[PAD]')
reranker_model.config.pad_token_id = reranker_tokenizer.pad_token_id

# Report the resulting padding configuration.
print(f"Pad token: {reranker_tokenizer.pad_token}")
print(f"Pad token ID: {reranker_tokenizer.pad_token_id}")
print(f"Model config pad_token_id: {reranker_model.config.pad_token_id}")

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at ./Qwen3-Reranker-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Pad token: [PAD]
Pad token ID: 151669
Model config pad_token_id: 151669


In [16]:
# Tokenizers already loaded by _get_rerank_tokenizer, keyed by checkpoint path.
_RERANK_TOKENIZER_CACHE = {}


def _get_rerank_tokenizer(path="./Qwen3-Reranker-0.6B"):
    """Return the reranker tokenizer for ``path``, loading it at most once."""
    tokenizer = _RERANK_TOKENIZER_CACHE.get(path)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(path, padding_side='left')
        _RERANK_TOKENIZER_CACHE[path] = tokenizer
    return tokenizer


def reranker_tokenize(text):
    """Split ``text`` into tokens with the reranker's tokenizer.

    This is passed to BM25 as its tokenizer, so it is called once per node and
    once per query. The tokenizer is therefore loaded from disk only on first
    use and reused afterwards, instead of being reloaded on every call.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; an empty list for empty or whitespace-only
        input (no tokenizer is loaded in that case).
    """
    if not text.strip():
        return []
    return _get_rerank_tokenizer().tokenize(text)

In [17]:
# Keyword (BM25) retriever over the summarized nodes, tokenizing with the
# reranker tokenizer; it is configured for the top 10 matches.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes_with_sums, 
    similarity_top_k=10,
    tokenizer=reranker_tokenize)
# Dense retriever over the vector index, also configured for the top 10 matches.
vector_retriever = VectorIndexRetriever(index=index, similarity_top_k=10)

2025-10-09 15:48:19,772 - DEBUG - Building index from IDs objects


In [18]:
# Fuse keyword and vector retrieval in "reciprocal_rerank" mode.
# NOTE(review): this cell's recorded output logs another "Building index from
# IDs objects" line, the same message emitted when bm25_retriever was created.
# That suggests the pack may build its own retrievers from `nodes` and ignore
# the bm25_retriever / vector_retriever / similarity_top_k arguments — confirm
# against the pack's constructor.
hybrid_pack = HybridFusionRetrieverPack(
    nodes=nodes_with_sums,
    bm25_retriever=bm25_retriever,
    vector_retriever=vector_retriever,
    mode="reciprocal_rerank",
    similarity_top_k=10
)
# The retriever object that the query engine consumes.
hybrid_retriever = hybrid_pack.fusion_retriever

2025-10-09 15:48:39,813 - DEBUG - Building index from IDs objects


In [19]:
# Question-answering prompt. It shows the retrieved context ({context_str}) and
# instructs the model to answer the query ({query_str}) in Chinese, using only
# knowledge found in that context.
text_qa_template_str = (
    "上下文信息如下：\n"
    "{context_str}\n"
    "基于提供的上下文，用中文直接回答查询，答案只能从上下文知识中获取，不要自己发挥。\n"
    "查询：{query_str}\n"
    "回答："
)
text_qa_template = PromptTemplate(text_qa_template_str)

# Refinement prompt. Given the original query ({query_str}), the current answer
# ({existing_answer}) and additional context ({context_msg}), it asks the model
# to refine the answer in Chinese.
refine_template_str = (
    "原始查询是：{query_str}\n"
    "我们已有回答：{existing_answer}\n"
    "基于以下新上下文，用中文精炼现有回答，确保完整性和准确性：\n"
    "{context_msg}\n"
    "精炼后的回答："
)
refine_template = PromptTemplate(refine_template_str)

In [20]:
# Response synthesizer wired to the custom QA and refine prompts, using the
# "compact" response mode.
response_synthesizer = get_response_synthesizer(
    text_qa_template=text_qa_template,
    refine_template=refine_template,
    response_mode="compact"
)

In [21]:
# Query engine combining the hybrid retriever, the reranker (registered as a
# node postprocessor) and the response synthesizer.
query_engine = RetrieverQueryEngine(
    retriever=hybrid_retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[reranker]
)

In [22]:
# Smoke test: run one sample question ("What is GPT?") through the engine and
# print the synthesized answer.
query = "什么是GPT？"
response = query_engine.query(query)
print(response)

Generated queries:
Queries:
1. GPT的定义和工作原理
2. GPT模型的发展历程及其应用领域
3. 如何使用GPT进行文本生成与处理


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

GPT是一种通过大量网络数据学习的模型，旨在提升文本生成与问答能力。最初版本的GPT（Generative Pre-trained Transformer）在2018年出现，拥有约1亿1千7百万个参数，并基于相对较小的数据集进行训练。随着时间的发展，GPT系列模型逐渐增大，比如GPT-2的参数量达到了15亿，而GPT-3更是达到了GPT-2的100倍大，且训练数据量也显著增加至570GB。这些模型能够根据给定的输入生成连贯的文本输出，甚至可以回答问题或生成文章摘要等。简而言之，GPT是利用互联网上的海量信息来训练的一种深度学习模型，特别擅长处理自然语言相关的任务。
