In [1]:
# Mount Google Drive at /content/drive so the cells below can reach files stored there.
# force_remount=True is passed to the mount call (presumably so re-running this cell
# refreshes an existing mount — confirm against the Colab docs).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/Colab\ Notebooks/Side Projects/HP_chat_bot
%ls

/content/drive/MyDrive/Colab Notebooks/Side Projects/HP_chat_bot
 bert_tune_for_context_cn.ipynb
 [0m[01;34mchinese_database[0m/
 data_preprocessing_for_bert.ipynb
 [01;34menglish_database[0m/
 find_context.ipynb
 [01;34mfine_tuned_ckiplab_bert_base_chinese[0m/
 harry_potter_chatbot.ipynb
 HP_1_CN.txt
 HP_1_EN.txt
 HP_2_CN.txt
 HP_2_EN.txt
 HP_3_CN.txt
 HP_3_EN.txt
 HP_4_CN.txt
 HP_4_EN.txt
 HP_5_CN.txt
 HP_5_EN.txt
 HP_6_CN.txt
 HP_6_EN.txt
 HP_7_CN.txt
 HP_7_EN.txt
 [01;34mhp_cn_database[0m/
 [01;34mhp_cn_database_4cc0126e18294303a4868ed7c6ba5ccf[0m/
 HP_CN_QA.csv
 HP_CN_RAG_1.ipynb
 HP_CN_RAG_2.ipynb
 HP_CN_RAG_3.ipynb
 HP_CN_RAG_4.ipynb
 HP_CN_RAG_5.ipynb
 HP_CN_RAG.ipynb
 [01;34mhp_en_database[0m/
 HP_EN_QA.csv
 HP_EN_RAG.ipynb
 [01;34mresults[0m/
 [01;34mtrained_ckiplab_bert_base_chinese[0m/
'Transformers Training and Inference on Remote Hardware.ipynb'


In [2]:
%pip install langchain-community
%pip install langchain_openai
%pip install transformers
%pip install torch

Collecting langchain-community
  Downloading langchain_community-0.2.15-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.3.0,>=0.2.15 (from langchain-community)
  Downloading langchain-0.2.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.37 (from langchain-community)
  Downloading langchain_core-0.2.37-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain-community)
  Downloading langsmith-0.1.108-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain-community)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,

In [3]:
import os
import shutil
import torch
import numpy as np
from typing import List
from transformers import AutoTokenizer, AutoModel
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.embeddings.base import Embeddings
from langchain.prompts import ChatPromptTemplate
from tqdm import tqdm
import time

In [40]:
# Pick the torch device used by the local embedding model: CUDA GPU if present, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [41]:
# Set up the OpenAI API key without storing the secret in the notebook:
# reuse OPENAI_API_KEY when the environment already provides it, otherwise
# ask for it with hidden input so it never appears in the saved cell or output.
from getpass import getpass

API_KEY = os.environ.get('OPENAI_API_KEY') or getpass('Enter your OpenAI API key: ')
os.environ['OPENAI_API_KEY'] = API_KEY

In [42]:
# Define a custom embedding class using Microsoft's multilingual model
class MicrosoftEmbeddings(Embeddings):
    """LangChain embedding backend built on ``intfloat/multilingual-e5-small``.

    Each text is tokenized (truncated to 512 tokens), run through the model on
    the notebook-level ``device``, and reduced to a single vector by averaging
    the last-hidden-state token vectors over the real (non-padding) tokens only.

    Because padding is excluded from the average, a text gets the same vector
    whether it is embedded alone (``embed_query``) or inside a padded batch
    (``embed_documents``). Vectors produced by an unmasked average differ, so a
    database built that way should be recreated.

    NOTE(review): the E5 model card recommends prefixing inputs with
    "query: " / "passage: ". This class does not add them; confirm whether to
    adopt that convention (it would also require rebuilding stored vectors).
    """

    def __init__(self):
        # Initialize the tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small")
        self.model = AutoModel.from_pretrained("intfloat/multilingual-e5-small").to(device)

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Embed one batch of texts and return one row per text as a NumPy array."""
        inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        hidden = outputs.last_hidden_state
        # attention_mask is 1 for real tokens and 0 for padding; use it as a
        # weight so padded positions contribute nothing to the mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # clamp guards against a division by zero for an all-padding row
        token_counts = mask.sum(dim=1).clamp(min=1)
        return (summed / token_counts).cpu().numpy()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts in batches of 32; returns one vector per text.

        An empty ``texts`` list yields an empty result.
        """
        all_embeddings = []
        batch_size = 32
        for i in tqdm(range(0, len(texts), batch_size), desc="Embedding documents"):
            batch_texts = texts[i:i+batch_size]
            all_embeddings.extend(self._encode(batch_texts).tolist())
        return all_embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its vector."""
        return self._encode([text])[0].tolist()

In [43]:
# Define a class for OpenAI embeddings with retry mechanism
class OpenAIEmbeddingsWithRetry(OpenAIEmbeddings):
    """OpenAIEmbeddings variant that retries a failed call once.

    When the parent call raises any ``Exception``, the error is printed, the
    thread sleeps for 60 seconds, and the same call is made one more time.
    An error from that second attempt propagates to the caller.
    """

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` via the parent class, retrying once after a 60 s pause."""
        parent_embed = super().embed_documents
        try:
            vectors = parent_embed(texts)
        except Exception as e:
            print(f"Error in embedding: {e}")
            time.sleep(60)
            vectors = parent_embed(texts)
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query via the parent class, retrying once after a 60 s pause."""
        parent_embed = super().embed_query
        try:
            vector = parent_embed(text)
        except Exception as e:
            print(f"Error in embedding query: {e}")
            time.sleep(60)
            vector = parent_embed(text)
        return vector

In [44]:
# Function to load documents
def load_documents():
    """Read HP_1_CN.txt .. HP_7_CN.txt from the current working directory.

    Each file that exists is read as UTF-8 and wrapped in a ``Document`` whose
    ``source`` metadata is the file name. A missing file is reported with a
    printed message and skipped.

    Returns:
        list: one ``Document`` per file found, in book order (possibly empty).
    """
    loaded = []
    for book_no in range(1, 8):
        path = f'HP_{book_no}_CN.txt'
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        with open(path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        loaded.append(Document(page_content=text, metadata={"source": path}))
    return loaded

In [45]:
# Function to process and store documents in Chroma database
def process_and_store_documents(embedding_function, force_recreate=False,
                                batch_size=100, max_retries=3, retry_wait=60):
    """Load the existing Chroma database or build a new one from the book files.

    If the ``hp_cn_database`` directory exists and ``force_recreate`` is false,
    the stored database is opened with ``embedding_function`` and returned as
    is. Otherwise any existing directory is deleted, the documents returned by
    ``load_documents()`` are split into chunks, and the chunks are embedded and
    stored batch by batch.

    A batch that raises is retried after ``retry_wait`` seconds, up to
    ``max_retries`` attempts in total. A batch that still fails is skipped and
    listed in a warning printed at the end, so an incomplete database is never
    produced silently.

    Args:
        embedding_function: embeddings object passed to Chroma.
        force_recreate: when true, delete and rebuild an existing database.
        batch_size: number of chunks sent to Chroma per call (must be > 0).
        max_retries: total attempts per batch before it is skipped.
        retry_wait: seconds to sleep between attempts on the same batch.

    Returns:
        tuple: ``(db, CHROMA_PATH)`` — the Chroma instance and its directory.

    Raises:
        ValueError: if splitting yields no chunks, or if no batch could be
            stored at all.

    NOTE(review): when an existing database is loaded, ``embedding_function``
    presumably has to be the same model that built it — confirm with callers.
    """
    # Create database path
    CHROMA_PATH = 'hp_cn_database'

    # Reuse the stored database unless a rebuild was requested
    if os.path.exists(CHROMA_PATH) and not force_recreate:
        print("Loading existing database...")
        return Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function), CHROMA_PATH

    print("Creating new database...")
    # Delete old database if need to rebuild database
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new Chroma database
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,                                             # Increase chunk_size to include more chinese sentence
        chunk_overlap=100,                                          # Increase overlap to keep consistency of context
        length_function=len,
        separators=["\n\n", "\n", "。", "！", "？", "；", " ", ""],  # Add common chinese punctuation as separators
        keep_separator=True,
        add_start_index=True
    )

    # Load documents
    documents = load_documents()
    # Split Data Into Chunks
    chunks = text_splitter.split_documents(documents)

    if not chunks:
        raise ValueError("No documents to process. Check your document loading and splitting.")

    print(f"Total chunks to process: {len(chunks)}")

    # The first successful batch creates the database; later ones are appended
    db = None
    failed_batches = []
    for i in tqdm(range(0, len(chunks), batch_size), desc="Processing document chunks"):
        batch_chunks = chunks[i:i+batch_size]
        batch_end = i + len(batch_chunks)
        for attempt in range(1, max_retries + 1):
            try:
                if db is None:
                    db = Chroma.from_documents(batch_chunks, embedding_function, persist_directory=CHROMA_PATH)
                else:
                    db.add_documents(batch_chunks)
                break
            except Exception as e:
                print(f"Error processing batch {i}-{batch_end} (attempt {attempt}/{max_retries}): {e}")
                # Wait before trying the same batch again; no wait after the last attempt
                if attempt < max_retries:
                    time.sleep(retry_wait)
        else:
            # Every attempt failed: remember the range so the gap is reported
            failed_batches.append((i, batch_end))
        # Short pause between batches
        time.sleep(1)

    if db is None:
        raise ValueError("Failed to create database. Check your embedding function and document processing.")

    if failed_batches:
        print(f"Warning: {len(failed_batches)} batch(es) could not be stored and are missing "
              f"from the database (chunk ranges: {failed_batches}).")

    db.persist()
    return db, CHROMA_PATH


In [46]:
# Function to search and generate responses
def search_and_generate(query, db):
    """Answer ``query`` using the five most relevant chunks from ``db`` as context.

    The page contents of the retrieved chunks are joined with newlines and
    inserted, together with the question, into a chat prompt that is sent to
    the ``gpt-4o-mini`` chat model. The relevance scores returned by the
    search are not used.

    Args:
        query: the user's question.
        db: vector store exposing ``similarity_search_with_relevance_scores``.

    Returns:
        The ``content`` of the model's response.
    """
    results = db.similarity_search_with_relevance_scores(query, k=5)
    context = "\n".join(doc.page_content for doc, _ in results)

    chat_model = ChatOpenAI(model_name="gpt-4o-mini")
    prompt_template = ChatPromptTemplate.from_template(
        "根據以下上下文以繁體中文回答問題：\n\n{context}\n\n問題：{query}\n\n請確保回答與哈利波特的故事情節一致，並且只基於给定的上下文。"
    )
    # Build chat messages rather than a flat string: ChatPromptTemplate.format()
    # renders the prompt as text prefixed with "Human: ", and that prefix would
    # then be sent to the model as part of the user message.
    messages = prompt_template.format_messages(context=context, query=query)
    response = chat_model.invoke(messages)

    return response.content

In [48]:
def main():
    """Run the interactive question-answering session.

    Asks whether to rebuild the database and which embedding model to use,
    prepares the database, then answers questions in a loop until the user
    types 'quit'. Afterwards the user chooses whether to keep the database;
    if not, the collection and its directory are removed.
    """
    print("Initializing...")

    # Check if need to rebuild Chroma database
    force_recreate = input("Do you want to recreate the database? (y/n): ").lower().strip() == 'y'

    # Choose embedding model; anything other than "openai" uses the local model
    # NOTE(review): when an existing database is reused, the model chosen here
    # presumably has to match the one the database was built with — confirm.
    embedding_choice = input("Choose embedding model (openai/microsoft): ").lower().strip()
    if embedding_choice == "openai":
        embedding_function = OpenAIEmbeddingsWithRetry(model="text-embedding-3-small")
    else:
        if embedding_choice != "microsoft":
            print("Invalid choice. Using Microsoft embeddings by default.")
        embedding_function = MicrosoftEmbeddings()

    db, db_path = process_and_store_documents(embedding_function, force_recreate)
    print("Database ready. Starting conversation...")

    # Keep asking question until input 'quit'
    while True:
        # Strip surrounding whitespace so " quit " exits, matching how the
        # other prompts normalize their input
        query = input("請輸入您的問題（輸入 'quit' 退出）: ").strip()
        if query.lower() == 'quit':
            break
        # A blank line would otherwise trigger a retrieval and a model call
        if not query:
            continue
        answer = search_and_generate(query, db)
        print(f"回答: {answer}\n")

    # Keep Chroma database if needed
    keep_db = input("Do you want to keep the database for future use? (y/n): ").lower().strip() == 'y'
    if not keep_db:
        db.delete_collection()
        if os.path.exists(db_path):
            shutil.rmtree(db_path)
        print("Database cleaned up.")
    else:
        print("Database retained for future use.")

if __name__ == "__main__":
    main()

Initializing...
Do you want to recreate the database? (y/n): n
Choose embedding model (openai/microsoft): openai


  return Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function), CHROMA_PATH


Loading existing database...
Database ready. Starting conversation...
请输入您的问题（输入 'quit' 退出）: 哈利在三巫鬥法大賽中，吃了什麼讓他長出了腮，因而可以在水中游泳？
回答: 哈利在三巫鬥法大賽中吃了「海洋草」，這使得他長出了腮，能夠在水中游泳。

请输入您的问题（输入 'quit' 退出）: 傳說中的混血王子是誰？
回答: 傳說中的混血王子是塞弗勒斯·斯內普（Severus Snape）。他在《哈利·波特》系列中被稱為混血王子，因為他的父親是麻瓜，而母親則是巫師。這個稱號源於他在學校時期的身份以及他的血統背景。

请输入您的问题（输入 'quit' 退出）: 誰教會了哈利·波特護法咒？
回答: 哈利·波特是由小天狼星·布萊克教會護法咒的。

请输入您的问题（输入 'quit' 退出）: 霍格華茲各學院的學院杯分數是用什麼記錄的？
回答: 霍格華茲各學院的學院杯分數是用學院的獎勵和懲罰系統來記錄的。學生可以通過獲得優秀成績、參加競賽或表現良好來獲得分數，而違規或不當行為則會導致扣分。這些分數會隨著學年的進行而累積，最終決定哪一個學院獲得學院杯。

请输入您的问题（输入 'quit' 退出）: 小天狼星·布萊克的外號有人知道嗎？
回答: 小天狼星·布萊克的外號是「小天狼星」。在《哈利·波特》系列中，他是一位著名的巫師，曾經是個追隨者的「死神」之一，後來成為哈利·波特的教父。

请输入您的问题（输入 'quit' 退出）: 誰終結了娜吉妮？
回答: 在哈利波特的故事中，娜吉妮是伏地魔的仆從和一條蛇。終結娜吉妮的是哈利·波特，他在最後的戰鬥中使用了榮恩·衛斯理的劍，成功地殺死了她。

请输入您的问题（输入 'quit' 退出）: 霍格華茲女生盥洗室裡游蕩著哪位幽靈？
回答: 霍格華茲女生盥洗室裡游蕩著的是「悲傷的女鬼」（Moaning Myrtle）。

请输入您的问题（输入 'quit' 退出）: 哈利的第一根魔杖杖芯是什麼？
回答: 哈利的第一根魔杖杖芯是鳳凰羽毛。

请输入您的问题（输入 'quit' 退出）: 孚立維教授用什麼方法保護魔法石？
回答: 孚立維教授用了多重魔法來保護魔法石，包括設置一些具有挑戰性的魔法障礙和守護生物。他還使用了他的專業知識設計出了一系列的考驗，只有具備