# 基于问题生成的文档增强RAG

本笔记本通过问题生成对文档进行增强，实现一种改进的RAG方法。通过为每个文本块生成相关问题，我们改进了检索过程，从而让语言模型给出更好的回答。

在本实现中，我们遵循以下步骤：

1. 数据摄入：从PDF文件中提取文本。
2. 分块：将文本切分为便于处理的块。
3. 问题生成：为每个块生成相关问题。
4. 嵌入创建：为块和问题创建嵌入。
5. 向量存储创建：使用NumPy构建一个简单的向量存储。
6. 语义搜索：检索与用户查询相关的块和问题。
7. 响应生成：基于检索到的内容生成答案。
8. 评估：评估生成响应的质量。

In [22]:
import fitz
import os
import numpy as np
import json
from openai import OpenAI
import re
from tqdm import tqdm

## 提取PDF文本

In [23]:
def extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF and return it as one string.

    Args:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: Text of all pages, concatenated in page order.
    """
    # Open the document in a context manager so the file handle is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as pdf_document:
        # Collect the per-page text and join once instead of growing a
        # string with repeated `+=`.
        page_texts = [page.get_text("text") for page in pdf_document]

    return "".join(page_texts)

In [24]:
def chunk_text(text, n, overlap):
    """
    Split text into segments of at most n characters, with overlap.

    Consecutive chunks start `n - overlap` characters apart, so each chunk
    repeats the last `overlap` characters of the previous one. The trailing
    chunks may be shorter than `n`.

    Args:
    text (str): The text to be chunked.
    n (int): The number of characters in each chunk. Must be positive.
    overlap (int): The number of overlapping characters between chunks.
        Must satisfy 0 <= overlap < n.

    Returns:
    List[str]: A list of text chunks (empty when `text` is empty).

    Raises:
    ValueError: If `n` is not positive or `overlap` is outside [0, n).
    """
    if n <= 0:
        raise ValueError(f"chunk size n must be positive, got {n}")
    # A step of zero makes range() fail with an unrelated message, and a
    # negative step silently yields no chunks at all, so reject both up front.
    if not 0 <= overlap < n:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < n, got overlap={overlap}, n={n}"
        )

    step = n - overlap
    return [text[start:start + n] for start in range(0, len(text), step)]

In [31]:
# Initialize the OpenAI client with the base URL and API key
# The key is read from the SILLICONFLOW_API_KEY environment variable;
# os.getenv returns None when that variable is not set.
# NOTE(review): the variable name is spelled with a double "L" - confirm it
# matches the name actually exported in the runtime environment.
client = OpenAI(
    base_url="https://api.siliconflow.cn/v1/",
    api_key=os.getenv("SILLICONFLOW_API_KEY")
)

In [26]:
# Generate questions for each chunk
def generate_questions(text_chunk, num_questions=5, model="Qwen/Qwen3-8B"):
    """
    Generate questions that can be answered from the given text chunk.

    The model is asked for a numbered list; each returned line is stripped of
    its list numbering and kept only if it ends with a question mark.

    Args:
    text_chunk (str): The text chunk to generate questions from.
    num_questions (int): Number of questions to request from the model.
    model (str): The model to use for question generation.

    Returns:
    List[str]: Parsed questions. May contain fewer (or more) than
    `num_questions` entries, since it depends on what the model returns.
    """
    # Define the system prompt to guide the AI's behavior
    system_prompt = "You are an expert at generating relevant questions from text. Create concise questions that can be answered using only the provided text. Focus on key information and concepts."
    
    # Define the user prompt with the text chunk and the number of questions to generate
    user_prompt = f"""
    Based on the following text, generate {num_questions} different questions that can be answered using only this text:

    {text_chunk}
    
    Format your response as a numbered list of questions only, with no additional text.
    """
    
    # Generate questions using the OpenAI API
    response = client.chat.completions.create(
        model=model,
        temperature=0.7,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    
    # The message content is optional in the API response; treat a missing
    # body as "no questions" instead of failing on None.strip().
    questions_text = (response.choices[0].message.content or "").strip()
    questions = []
    
    for line in questions_text.split('\n'):
        # Strip list numbering in either "1." or "1)" form, plus whitespace
        cleaned_line = re.sub(r'^\d+[.)]\s*', '', line.strip())
        # Accept both the ASCII and the full-width question mark
        if cleaned_line and cleaned_line.endswith(('?', '？')):
            questions.append(cleaned_line)
    
    return questions

In [27]:
#Create Embeddings for chunks and questions
def create_embeddings(text, model="BAAI/bge-m3"):
    """
    Request embeddings for the given text from the embeddings endpoint.

    Args:
    text (str): The input text to be embedded.
    model (str): The embedding model to be used. Default is "BAAI/bge-m3".

    Returns:
    The unmodified response object from `client.embeddings.create`. Callers
    in this file read the vector from `response.data[0].embedding`.
    """
    # Hand the API response back as-is; extracting the vector is left to the caller
    return client.embeddings.create(input=text, model=model)
    

In [28]:
# Create Vectors Store
class SimpleVectorStore:
    """
    A simple in-memory vector store implementation using NumPy.

    Items are kept in three parallel lists (`vectors`, `texts`, `metadata`)
    that share the same index.
    """
    def __init__(self):
        """
        Initialize an empty vector store.
        """
        self.vectors = []
        self.texts = []
        self.metadata = []
    
    def add_item(self, text, embedding, metadata=None):
        """
        Add an item to the vector store.

        Args:
        text (str): The original text.
        embedding (List[float]): The embedding vector.
        metadata (dict, optional): Additional metadata. Stored as an empty
            dict when omitted.
        """
        self.vectors.append(np.array(embedding))
        self.texts.append(text)
        self.metadata.append(metadata or {})
    
    def similarity_search(self, query_embedding, k=5):
        """
        Find the items most similar to a query embedding by cosine similarity.

        Args:
        query_embedding (List[float]): Query embedding vector.
        k (int): Maximum number of results to return.

        Returns:
        List[Dict]: Up to k items ordered by descending similarity, each with
        keys "text", "metadata" and "similarity". Items with equal similarity
        keep their insertion order. A zero-length query or stored vector
        scores 0.0.
        """
        if not self.vectors:
            return []
        
        # Convert query embedding to numpy array; its norm is the same for
        # every comparison, so compute it once.
        query_vector = np.array(query_embedding)
        query_norm = np.linalg.norm(query_vector)
        
        scored = []
        for idx, vector in enumerate(self.vectors):
            denominator = query_norm * np.linalg.norm(vector)
            if denominator:
                score = np.dot(query_vector, vector) / denominator
            else:
                # A zero-length vector has no direction; 0/0 would yield NaN
                # and NaN keys make the sort order below meaningless.
                score = 0.0
            scored.append((idx, score))
        
        # Sort by similarity (descending); list.sort is stable, so ties keep
        # insertion order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        
        # A non-positive k yields no results
        return [
            {
                "text": self.texts[idx],
                "metadata": self.metadata[idx],
                "similarity": score,
            }
            for idx, score in scored[:max(k, 0)]
        ]

In [29]:
# Process Document
def process_document(pdf_path, chunk_size=1000, chunk_overlap=200, questions_per_chunk=5):
    """
    Build a question-augmented vector store from a PDF document.

    The PDF text is extracted and chunked; every chunk is embedded and stored,
    and so is every question generated for that chunk. Question entries carry
    the index and full text of the chunk they were generated from.

    Args:
    pdf_path (str): Path to the PDF file.
    chunk_size (int): Size of each text chunk in characters.
    chunk_overlap (int): Overlap between chunks in characters.
    questions_per_chunk (int): Number of questions to generate per chunk.

    Returns:
    Tuple[List[str], SimpleVectorStore]: Text chunks and vector store.
    """
    print("Extracting text from PDF...")
    raw_text = extract_text_from_pdf(pdf_path)

    print("Chunking text...")
    text_chunks = chunk_text(raw_text, chunk_size, chunk_overlap)
    print(f"Created {len(text_chunks)} text chunks")

    store = SimpleVectorStore()

    print("Processing chunks and generating questions...")
    for chunk_idx, chunk in enumerate(tqdm(text_chunks, desc="Processing Chunks")):
        # Store the chunk itself under its own embedding
        chunk_vector = create_embeddings(chunk).data[0].embedding
        store.add_item(
            text=chunk,
            embedding=chunk_vector,
            metadata={"type": "chunk", "index": chunk_idx},
        )

        # Store each generated question, linked back to its source chunk
        for question in generate_questions(chunk, num_questions=questions_per_chunk):
            question_vector = create_embeddings(question).data[0].embedding
            store.add_item(
                text=question,
                embedding=question_vector,
                metadata={"type": "question", "chunk_index": chunk_idx, "original_chunk": chunk},
            )

    return text_chunks, store

In [None]:
# Define the path to the PDF file
pdf_path = "data/AI_Information.pdf"

# Process the document (extract text, create chunks, generate questions, build vector store)
# questions_per_chunk=3 overrides the function default of 5.
text_chunks, vector_store = process_document(
    pdf_path, 
    chunk_size=1000, 
    chunk_overlap=200, 
    questions_per_chunk=3
)

# The store holds the chunks plus their generated questions, so this count
# is larger than the number of chunks.
print(f"Vector store contains {len(vector_store.texts)} items")

Extracting text from PDF...
Chunking text...
Created 42 text chunks
Processing chunks and generating questions...


Processing Chunks: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [14:49<00:00, 21.18s/it]

Vector store contains 168 items





In [34]:
# Inspect the most recently added store entry: its text, embedding vector
# and metadata (the three lists share the same index).
print("Vector store element -1:", vector_store.texts[-1])
print("Vector store element -1 embedding:", vector_store.vectors[-1])
print("Vector store element -1 metadata:", vector_store.metadata[-1])

Vector store element -1: What qualities are necessary to harness AI's transformative potential according to the text?
Vector store element -1 embedding: [-0.02662256  0.01133184 -0.010896   ... -0.00609268 -0.02836592
  0.02566008]
Vector store element -1 metadata: {'type': 'question', 'chunk_index': 41, 'original_chunk': 'promoting STEM education, providing reskilling and upskilling opportunities, and \nfostering lifelong learning. \nA Human-Centered Approach \nA human-centered approach to AI focuses on developing AI systems that enhance human \ncapabilities, promote well-being, and align with human values. This involves considering the \nethical, social, and psychological impacts of AI and prioritizing human needs and interests. \nBy embracing these principles and working together, we can harness the transformative potential \nof AI to create a more innovative, equitable, and sustainable future. The path forward requires \ndedication, collaboration, and a commitment to responsible AI

In [35]:
# Semantic Search
def semantic_search(query, vector_store, k=5):
    """
    Embed the query and return its nearest neighbours from the vector store.

    Args:
    query (str): The search query.
    vector_store (SimpleVectorStore): The vector store to search in.
    k (int): Number of results to return.

    Returns:
    List[Dict]: Top k most relevant items, as returned by
    `vector_store.similarity_search`.
    """
    # Embed the query with the same helper used for the stored items
    query_vector = create_embeddings(query).data[0].embedding

    # Delegate the ranking to the store
    return vector_store.similarity_search(query_vector, k=k)

# Run An Augmented RAG Query in Vector Store

In [36]:
# Load the validation data from a JSON file
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open('data/val.json', encoding='utf-8') as f:
    data = json.load(f)

query_index = 3
# Select the validation query at position `query_index`
query = data[query_index]['question']

# Perform semantic search to find relevant content
search_results = semantic_search(query, vector_store, k=5)

print("Query:", query)
print("\nSearch Results:")

# Organize results by type: direct chunk hits vs. everything else
# (generated-question hits)
chunk_results = [hit for hit in search_results if hit["metadata"]["type"] == "chunk"]
question_results = [hit for hit in search_results if hit["metadata"]["type"] != "chunk"]

# Print chunk results first
print("\nRelevant Document Chunks:")
for i, result in enumerate(chunk_results):
    print(f"Context {i + 1} (similarity: {result['similarity']:.4f}):")
    # Show only the first 300 characters of each chunk
    print(result["text"][:300] + "...")
    print("=====================================")

# Then print question matches
print("\nMatched Questions:")
for i, result in enumerate(question_results):
    print(f"Question {i + 1} (similarity: {result['similarity']:.4f}):")
    print(result["text"])
    chunk_idx = result["metadata"]["chunk_index"]
    print(f"From chunk {chunk_idx}")
    print("=====================================")

Query: How does AI contribute to personalized medicine?

Search Results:

Relevant Document Chunks:

Matched Questions:
Question 1 (similarity: 0.8071):
What are the benefits of AI in healthcare?
From chunk 24
Question 2 (similarity: 0.8066):
How does AI improve healthcare administration?
From chunk 24
Question 3 (similarity: 0.7760):
How does AI contribute to accelerating drug discovery and development?
From chunk 23
Question 4 (similarity: 0.7685):
In what ways does AI enhance personalized learning in education?
From chunk 7
Question 5 (similarity: 0.7639):
What are the key applications of AI in healthcare as described in the text?
From chunk 23


In [39]:
# Generate context using Augmented RAG; each chunk reached through a question is included only once
def prepare_context(search_results):
    """
    Build a single context string from search results for response generation.

    Direct chunk hits come first, in result order. Chunks reached through a
    matched question follow, skipping any chunk index that is already
    included; the first question that references a chunk is the one named
    in its header.

    Args:
    search_results (List[Dict]): Results from semantic search.

    Returns:
    str: Context sections joined by blank lines ("" when there are no results).
    """
    direct_hits = [hit for hit in search_results if hit["metadata"]["type"] == "chunk"]
    question_hits = [hit for hit in search_results if hit["metadata"]["type"] == "question"]

    # Sections for chunks that matched the query directly
    sections = []
    seen_indices = set()
    for hit in direct_hits:
        direct_idx = hit["metadata"]["index"]
        direct_text = hit["text"]
        seen_indices.add(direct_idx)
        sections.append(f"Chunk {direct_idx}:\n{direct_text}")

    # Sections for chunks pulled in via a matched question
    for hit in question_hits:
        details = hit["metadata"]
        source_idx = details["chunk_index"]
        if source_idx in seen_indices:
            continue
        seen_indices.add(source_idx)
        question_text = hit["text"]
        source_text = details["original_chunk"]
        sections.append(
            f"Chunk {source_idx} (referenced by question '{question_text}'):\n{source_text}"
        )

    return "\n\n".join(sections)

In [43]:
def generate_response(query, context, model_name="Qwen/Qwen3-8B"):
    """
    Ask the chat model to answer a query from the supplied context.

    Args:
    query (str): The user's query.
    context (str): The context to be used for response generation.
    model_name (str): The name of the model to be used for response generation.

    Returns:
    The unmodified chat completion response object. Callers in this file read
    the answer text from `response.choices[0].message.content`.
    """
    system_message = """You are a helpful assistant. You are given a query and a context. 
    Generate a response to the query derived from the given context directly or based on the context. 
    Be concise and accurate.
    If the query is not related to the context, answer with 'I don't have enough information to answer that.'"""
    user_message = f"""Query: {query}
    Context: {context}"""

    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    return client.chat.completions.create(
        model=model_name,
        temperature=0,
        messages=chat_messages,
    )

In [46]:
# Build the context from the search results and ask the model to answer
context = prepare_context(search_results)
response = generate_response(query, context)

# generate_response returns the full completion object, so the answer text
# is read from the first choice's message.
print("Query:", query)
print("Response:", response.choices[0].message.content)
print("Context:", context)



Query: How does AI contribute to personalized medicine?
Response: 

AI contributes to personalized medicine by analyzing individual patient data to predict treatment responses and tailor interventions. This approach enhances treatment effectiveness and reduces adverse effects, as highlighted in the context.
Context: Chunk 24 (referenced by question 'What are the benefits of AI in healthcare?'):
control. These systems enhance dexterity, reduce invasiveness, and 
improve patient outcomes. 
Healthcare Administration 
AI streamlines healthcare administration by automating tasks, managing patient records, and 
optimizing workflows. AI-powered systems improve efficiency, reduce costs, and enhance 
patient experience. 
Chapter 12: AI and Cybersecurity 
Threat Detection and Prevention 
AI enhances cybersecurity by detecting and preventing threats, analyzing network traffic, and 
identifying vulnerabilities. AI-powered systems automate security tasks, improve threat 
detection accuracy, and enh

# Evaluating the answer to the query

In [49]:
# Grading rubric for the judge model: 1 = very close to the reference answer,
# 0.5 = partially aligned, 0 = incorrect or unsatisfactory.
sys_prompt = """
You are an intelligent evaluation system tasked with assessing the AI assistant's responses.
If the AI assistant's response is very close to the true response, assign a score of 1.
If the response is incorrect or unsatisfactory in relation to the true response, assign a score of 0. 
If the response is partially aligned with the true response, assign a score of 0.5.
"""
# Reference answer from the validation set and the answer generated for the same query
ideal_response = data[query_index]['ideal_answer']
generated_response = response.choices[0].message.content

# Give the judge the query, the reference answer and the generated answer
user_prompt = f"""Query: {query}
Ideal Response: {ideal_response}
AI Assistant's Response: {generated_response}"""

# Ask the chat model to grade the answer against the rubric
evaluation_response = client.chat.completions.create(
    model="Qwen/Qwen3-8B",
    temperature=0,
    messages=[
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_prompt}
    ]
)
print("query: ", query)
print("ideal_response: ", ideal_response)
print("generated_response: ", generated_response)
print("evaluation: ", evaluation_response.choices[0].message.content)



query:  How does AI contribute to personalized medicine?
ideal_response:  AI enables personalized medicine by analyzing individual patient data, predicting treatment responses, and tailoring interventions to specific needs. This enhances treatment effectiveness and reduces adverse effects.
generated_response:  

AI contributes to personalized medicine by analyzing individual patient data to predict treatment responses and tailor interventions. This approach enhances treatment effectiveness and reduces adverse effects, as highlighted in the context.
evaluation:  

1
