<a href="https://colab.research.google.com/github/alicetw40342/Building-a-PDF-Based-Question-Answering-System-with-RAG-and-Gemini/blob/main/Project3_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ 安裝必要套件
!pip install -q sentence-transformers chromadb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m61.4/67.3 kB[0m [31m106.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m7.4 MB/

In [None]:
# ✅ 匯入套件與初始化模型
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List
import chromadb

# Use a lightweight, fast local model.
# The instance is created once at module level; EmbeddingUtils.get_embedding
# reads this global for every encode call.
local_model = SentenceTransformer("all-MiniLM-L6-v2")


In [None]:
# ✅ 定義本地嵌入工具與距離計算
class EmbeddingUtils:
    """Static helpers for local text embeddings and nearest-neighbour ranking.

    The class carries no state; it only groups the functions under one name.
    """

    # Metric names understood by ``distances_from_embeddings``.
    _SUPPORTED_METRICS = ("cosine", "L1", "L2", "Linf")

    @staticmethod
    def get_embedding(text: str, model="local") -> List[float]:
        """Return the embedding of *text* as a plain list of floats.

        Args:
            text: The string to encode.
            model: Accepted for call-site compatibility but ignored; the
                module-level ``local_model`` always performs the encoding.
        """
        return local_model.encode(text).tolist()

    @staticmethod
    def distances_from_embeddings(
        query_embedding, embeddings, distance_metric="cosine"
    ) -> List[float]:
        """Return the distance from *query_embedding* to each row of *embeddings*.

        Args:
            query_embedding: A single vector.
            embeddings: A sequence of vectors with the same length as the query.
            distance_metric: One of ``"cosine"`` (1 - cosine similarity),
                ``"L1"`` (sum of absolute differences), ``"L2"`` (Euclidean)
                or ``"Linf"`` (largest absolute difference).

        Returns:
            One plain ``float`` per entry of *embeddings*, in input order.
            An empty *embeddings* yields an empty list.

        Raises:
            ValueError: If *distance_metric* is not one of the supported names.
        """
        # Reject unknown metrics up front rather than silently answering
        # with cosine distances the caller did not ask for.
        if distance_metric not in EmbeddingUtils._SUPPORTED_METRICS:
            raise ValueError(
                f"Unsupported distance_metric {distance_metric!r}; "
                f"expected one of {EmbeddingUtils._SUPPORTED_METRICS}"
            )

        # Nothing to compare against; avoids handing an empty array to the
        # numeric routines below.
        if len(embeddings) == 0:
            return []

        if distance_metric == "cosine":
            similarities = cosine_similarity([query_embedding], embeddings)[0]
            # .tolist() converts NumPy scalars into built-in floats.
            return (1.0 - similarities).tolist()

        # Local import keeps this block self-contained; NumPy is only needed
        # for the non-cosine metrics.
        import numpy as np

        # Absolute per-component difference between every row and the query.
        offsets = np.abs(
            np.asarray(embeddings, dtype=float)
            - np.asarray(query_embedding, dtype=float)
        )
        if distance_metric == "L1":
            return offsets.sum(axis=1).tolist()
        if distance_metric == "L2":
            return np.sqrt((offsets ** 2).sum(axis=1)).tolist()
        return offsets.max(axis=1).tolist()  # "Linf"

    @staticmethod
    def indices_of_nearest_neighbors_from_distances(distances: List[float]) -> List[int]:
        """Return the indices of *distances* ordered from smallest to largest.

        The sort is stable, so equal distances keep their original order.
        """
        return sorted(range(len(distances)), key=distances.__getitem__)


In [None]:
# ✅ 建立 Chroma 向量資料庫並填入嵌入資料
def create_chroma_db(documents: List[str], path: str, name: str):
    """Embed *documents* with the local model and store them in Chroma.

    Args:
        documents: Text chunks to embed and store.
        path: Directory handed to ``chromadb.PersistentClient``.
        name: Collection name; an existing collection with this name is
            reused, otherwise a new one is created.

    Returns:
        The Chroma collection that was written to (or left untouched when
        *documents* is empty).
    """
    client = chromadb.PersistentClient(path=path)
    collection = client.get_or_create_collection(name=name)

    # Nothing to store: return the collection as-is instead of submitting an
    # empty batch (which Chroma's input validation may reject).
    if not documents:
        return collection

    # The helpers are static, so no EmbeddingUtils instance is needed.
    embeddings = [EmbeddingUtils.get_embedding(doc) for doc in documents]
    ids = [f"doc_{i}" for i in range(len(documents))]

    # Ids are positional (doc_0, doc_1, ...), so calling this again on the
    # same path/name resubmits ids that already exist. upsert overwrites
    # those records rather than depending on how add() treats duplicates.
    collection.upsert(
        documents=documents,
        embeddings=embeddings,
        ids=ids,
    )

    return collection


In [None]:
# ✅ 測試資料與執行結果
# Sample sentences used as the test chunks to embed and store.
test_chunks = [
    "Artificial intelligence is transforming the world.",
    "Machine learning is a technique in AI.",
    "Neural networks mimic the human brain."
]

# Storage directory and collection name passed to create_chroma_db.
db_path = "/content/chroma_db"
collection_name = "pdf_chunks"

# Build (or reopen) the collection, then print a success message followed by
# the number of records it currently holds.
collection = create_chroma_db(test_chunks, db_path, collection_name)
print("✅ 向量資料庫建立成功，共儲存筆數：", collection.count())
