<a href="https://colab.research.google.com/github/Young931127/ai-dream-analyzer/blob/main/final_project_vectorDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the last cell of this notebook
# copies the zipped FAISS index into a folder under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 安裝必要套件

In [None]:
!pip install -q langchain langchain-community pypdf python-docx faiss-cpu
!pip install -q sentence-transformers transformers huggingface_hub unstructured

In [None]:
import os
import shutil
from google.colab import files
from huggingface_hub import login
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

### 讀入金鑰

In [None]:
# Log in to the Hugging Face Hub.
# First try the token stored in the Colab secret named 'HuggingFace'; if that
# fails for any ordinary reason (not running in Colab, secret missing or not
# shared with this notebook, token rejected), fall back to the interactive
# login prompt.
try:
    from google.colab import userdata
    hf_token = userdata.get('HuggingFace')
    login(token=hf_token)
except Exception as exc:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit still stop the cell instead of triggering the fallback.
    print("cannot find secret key. Please enter token manually.")
    # Only the exception type is shown, to say why the automatic path failed
    # without echoing any message content.
    print(f"(automatic login failed: {type(exc).__name__})")
    login()

### 上傳檔案

In [None]:
upload_dir = "uploaded_docs"

# Always start from an empty folder so files from an earlier run cannot get
# mixed into this one.
if os.path.exists(upload_dir):
    shutil.rmtree(upload_dir)
os.makedirs(upload_dir, exist_ok=True)

print(f"\nTab the button to upload your files")
uploaded = files.upload()

if uploaded:
    # Each uploaded file is moved from the current working directory (the path
    # is just its bare filename) into the upload_dir folder.
    for filename in uploaded:
        dst_path = os.path.join(upload_dir, filename)
        shutil.move(filename, dst_path)
        print(f"檔案已移動至: {dst_path}")
else:
    print("error：沒有上傳任何檔案。")


Tab the button to upload your files


Saving AIdream_data.txt to AIdream_data.txt
檔案已移動至: uploaded_docs/AIdream_data.txt


### Define the embedding model (EmbeddingGemma)

In [None]:
class EmbeddingGemmaEmbeddings(HuggingFaceEmbeddings):
    """HuggingFaceEmbeddings preconfigured for ``google/embeddinggemma-300m``.

    Every input is wrapped in a text prompt before it is handed to the parent
    class: documents become ``title: none | text: <document>`` and queries
    become ``task: search result | query: <query>``.
    NOTE(review): these look like the prompt templates the model expects for
    retrieval; confirm against the model card.
    """

    def __init__(self, **kwargs):
        """Fix the model name and ask for normalized embeddings.

        Any extra keyword arguments are forwarded unchanged to
        ``HuggingFaceEmbeddings``.
        """
        super().__init__(
            model_name="google/embeddinggemma-300m",
            encode_kwargs={"normalize_embeddings": True},
            **kwargs
        )

    def embed_documents(self, texts):
        """Embed a list of documents after adding the document prompt to each."""
        prompted_texts = [f'title: none | text: {t}' for t in texts]
        return super().embed_documents(prompted_texts)

    def embed_query(self, text):
        """Embed a single search query after adding the query prompt."""
        prompted_query = f'task: search result | query: {text}'
        return super().embed_query(prompted_query)


# Shared embedding model instance used when building the vector store below.
embedding_model = EmbeddingGemmaEmbeddings()

### 讀取與切割文件

In [None]:
# Load every supported file in upload_dir into `documents`, then cut the
# documents into overlapping chunks (`split_docs`) for embedding.

# Supported (lower-case) file extension -> callable that builds a loader for a path.
loader_factories = {
    # encoding="utf-8" avoids decode errors when reading Chinese text
    ".txt": lambda p: TextLoader(p, encoding="utf-8"),
    ".pdf": PyPDFLoader,
    ".docx": UnstructuredWordDocumentLoader,
}

documents = []
# sorted() gives a stable processing order; os.listdir() returns entries in
# arbitrary order.
for file in sorted(os.listdir(upload_dir)):
    path = os.path.join(upload_dir, file)
    print(f"正在處理: {path}")

    # Match the extension case-insensitively so files such as "NOTES.TXT" or
    # "book.PDF" are loaded instead of being silently dropped.
    lowered_name = file.lower()
    make_loader = next(
        (factory for ext, factory in loader_factories.items()
         if lowered_name.endswith(ext)),
        None,
    )
    if make_loader is None:
        # Say so explicitly rather than ignoring the file without a trace.
        print(f"略過不支援的檔案格式: {path}")
        continue

    loader = make_loader(path)
    documents.extend(loader.load())

# Split the text into chunks. chunk_size can be tuned to the length of the
# dream content; 200 is roughly one complete explanation.
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
split_docs = splitter.split_documents(documents)

正在處理: uploaded_docs/AIdream_data.txt


### 建立 vectorDB

In [None]:
# Build the FAISS index from the chunks, save it locally, then zip it and copy
# the archive to Google Drive. Nothing is built when there are no chunks.
if not split_docs:
    print("error：沒有讀到任何資料，請確認有沒有上傳檔案。")
else:
    # Embed every chunk and index the vectors.
    vectorstore = FAISS.from_documents(split_docs, embedding_model)

    # Persist the index to a local folder.
    db_folder_name = "faiss_db"
    vectorstore.save_local(db_folder_name)

    # Backup destination on the mounted Drive; create it on first use.
    DRIVE_PATH = "/content/drive/MyDrive/Dream"
    if not os.path.exists(DRIVE_PATH):
        os.makedirs(DRIVE_PATH)

    print("正在壓縮並備份到雲端...")
    # Zip the saved index folder into dream_db.zip, then copy it to Drive.
    shutil.make_archive("dream_db", 'zip', db_folder_name)
    shutil.copy("dream_db.zip", f"{DRIVE_PATH}/dream_db.zip")

    print(f"資料庫已備份至：{DRIVE_PATH}/dream_db.zip")

正在壓縮並備份到雲端...
資料庫已備份至：/content/drive/MyDrive/Dream/dream_db.zip
