<a href="https://colab.research.google.com/github/alicetw40342/Building-a-PDF-Based-Question-Answering-System-with-RAG-and-Gemini/blob/main/Project3_task4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from typing import List

def make_rag_prompt(query: str, relevant_passage: List[str]) -> str:
    """Combine the user's question and the retrieved passages into one prompt.

    Args:
        query: The question to put to the generative model.
        relevant_passage: Passages to present as context; they are joined
            with a blank line between consecutive entries.

    Returns:
        The prompt text: an introductory line, the joined passages, then the
        question on the line after its own lead-in.
    """
    passages_text = "\n\n".join(relevant_passage)
    context_section = f"Based on the following information:\n\n{passages_text}\n\n"
    question_section = f"Please answer the following question:\n{query}"
    return context_section + question_section


In [4]:
# Suppose these passages were just returned by get_relevant_passage()
retrieved_passages = [
    "Artificial intelligence is transforming the world.",
    "Neural networks are inspired by the structure of the human brain."
]

# The user's question
user_query = "How is AI connected to how the human brain works?"

# Build the prompt from the question and the passages
prompt = make_rag_prompt(user_query, retrieved_passages)

# Show the generated prompt (heading first, then the prompt text)
print("📝 生成的 Prompt：\n")
print(prompt)


📝 生成的 Prompt：

Based on the following information:

Artificial intelligence is transforming the world.

Neural networks are inspired by the structure of the human brain.

Please answer the following question:
How is AI connected to how the human brain works?


In [None]:
!pip install -q sentence-transformers chromadb PyPDF2


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m7.3 MB/s[0m eta [36m0:0

In [None]:
import re
from typing import List
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path handed to ``PdfReader``.

    Returns:
        The result of ``extract_text()`` for each page, in page order,
        separated by a single newline character.
    """
    pdf = PdfReader(pdf_path)
    page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(page_texts)

def split_text(text: str, max_chunk_size: int = 500) -> List[str]:
    """Split text into chunks of at most ``max_chunk_size`` characters.

    The text is first split into paragraphs on blank lines. A paragraph that
    fits within the limit becomes one chunk. A longer paragraph is split into
    sentences (after ``.``, ``!`` or ``?`` followed by spaces), and consecutive
    sentences are packed together, separated by a single space, for as long
    as the packed chunk stays within the limit.

    Args:
        text: The text to split.
        max_chunk_size: Maximum chunk length in characters, counting the
            spaces inserted between packed sentences.

    Returns:
        The non-empty, whitespace-stripped chunks in document order. A single
        sentence that is longer than ``max_chunk_size`` is not broken up; it
        is emitted as its own chunk and therefore exceeds the limit.
    """
    chunks: List[str] = []

    for para in re.split(r'\n\s*\n', text):
        para = para.strip()
        if not para:
            continue
        if len(para) <= max_chunk_size:
            chunks.append(para)
            continue

        current = ""
        for sentence in re.split(r'(?<=[.!?]) +', para):
            # Measure the chunk as it would actually be stored, including the
            # joining space, so a packed chunk can never overshoot the limit.
            candidate = f"{current} {sentence}" if current else sentence
            if len(candidate) <= max_chunk_size:
                current = candidate
                continue
            # Flush only real content: when the very first sentence is already
            # too long there is nothing accumulated yet, and appending here
            # would emit an empty chunk.
            flushed = current.strip()
            if flushed:
                chunks.append(flushed)
            current = sentence

        tail = current.strip()
        if tail:
            chunks.append(tail)

    return chunks


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import chromadb

# Initialize the local model: a SentenceTransformer loaded by the name
# "all-MiniLM-L6-v2"; EmbeddingUtils.get_embedding encodes text with it.
local_model = SentenceTransformer("all-MiniLM-L6-v2")

class EmbeddingUtils:
    """Static helpers for embedding text and ranking embeddings by distance."""

    @staticmethod
    def get_embedding(text: str, model="local") -> List[float]:
        """Encode ``text`` with the module-level ``local_model``.

        Args:
            text: The text to embed.
            model: Accepted but not used; ``local_model`` is always the
                encoder.

        Returns:
            The encoder output converted with ``.tolist()``.
        """
        vector = local_model.encode(text)
        return vector.tolist()

    @staticmethod
    def distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine") -> List[float]:
        """Return ``1 - cosine similarity`` between the query and each embedding.

        Args:
            query_embedding: The single embedding to compare from.
            embeddings: The embeddings to compare against.
            distance_metric: Accepted but not used; the cosine-based distance
                is always computed.

        Returns:
            One distance per entry of ``embeddings``, in the same order.
        """
        # The query is wrapped in a list to form a one-row batch; row 0 of the
        # result holds its similarity to every candidate embedding.
        similarity_row = cosine_similarity([query_embedding], embeddings)[0]
        return [1 - score for score in similarity_row]

    @staticmethod
    def indices_of_nearest_neighbors_from_distances(distances: List[float]) -> List[int]:
        """Return the positions of ``distances`` ordered from smallest to largest.

        Equal distances keep their original relative order because the sort
        is stable.
        """
        ranked = sorted(enumerate(distances), key=lambda pair: pair[1])
        return [position for position, _ in ranked]

def create_chroma_db(documents: List[str], path: str, name: str):
    """Embed ``documents`` and store them in a persistent Chroma collection.

    Args:
        documents: The text chunks to store.
        path: Directory passed to ``chromadb.PersistentClient``.
        name: Name of the collection to fetch, or to create if it is missing.

    Returns:
        The collection. When ``documents`` is empty nothing is added and the
        collection is returned as fetched or created.
    """
    client = chromadb.PersistentClient(path=path)
    collection = client.get_or_create_collection(name=name)

    if not documents:
        # NOTE(review): Chroma's add() is expected to reject empty id lists,
        # so an empty input is a no-op here instead of an error — confirm
        # against the installed chromadb version.
        return collection

    # get_embedding is a static method, so no EmbeddingUtils instance is needed.
    embeddings = [EmbeddingUtils.get_embedding(doc) for doc in documents]
    # Ids are derived from list position only ("doc_0", "doc_1", ...), so a
    # second call on the same collection produces the same ids again.
    ids = [f"doc_{i}" for i in range(len(documents))]
    collection.add(documents=documents, embeddings=embeddings, ids=ids)
    return collection


In [None]:
def get_relevant_passage(query: str, db, n_results: int = 3) -> List[str]:
    """Look up the stored passages closest to ``query``.

    Args:
        query: The question to embed and search with.
        db: An object whose ``query(query_embeddings=..., n_results=...)``
            call returns a mapping with a ``"documents"`` entry.
        n_results: Number of results requested from ``db``.

    Returns:
        The first element of the ``"documents"`` entry of the query result.
    """
    query_vector = EmbeddingUtils.get_embedding(query)
    hits = db.query(query_embeddings=[query_vector], n_results=n_results)
    # Exactly one query embedding was submitted, so index 0 is its result.
    return hits["documents"][0]


In [None]:
def make_rag_prompt(query: str, relevant_passage: List[str]) -> str:
    """Build the prompt that presents retrieved passages followed by the question.

    Args:
        query: The user's question.
        relevant_passage: Retrieved passages; consecutive passages are
            separated by a blank line in the prompt.

    Returns:
        The full prompt string.
    """
    joined_passages = "\n\n".join(relevant_passage)
    return (
        f"Based on the following information:\n\n{joined_passages}\n\n"
        f"Please answer the following question:\n{query}"
    )


In [None]:
# Sample text chunks standing in for chunks extracted from a PDF
chunks = [
    "Artificial intelligence is transforming the world.",
    "Machine learning is a technique in AI.",
    "Neural networks mimic the human brain.",
]

# Build the vector database from the sample chunks
db_path = "/content/chroma_db"
collection_name = "pdf_chunks"
collection = create_chroma_db(documents=chunks, path=db_path, name=collection_name)

# Ask a question, retrieve the two closest chunks, and build the prompt
query = "What is the connection between AI and the brain?"
relevant = get_relevant_passage(query, collection, n_results=2)
prompt = make_rag_prompt(query, relevant)

# Show the question, the retrieved chunks (numbered from 1), and the prompt
print("🔍 問題：", query)
print("\n📚 相關段落：")
for i, p in enumerate(relevant):
    print(f"Chunk {i+1}: {p}")
print("\n📝 Prompt：\n", prompt)
