In [1]:
# Mount Google Drive into the Colab filesystem so the project files are reachable.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point, force_remount=True)

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/Colab\ Notebooks/Side Projects/HP_chat_bot
%ls

/content/drive/MyDrive/Colab Notebooks/Side Projects/HP_chat_bot
 bert_tune_for_context_cn.ipynb
 [0m[01;34mchinese_database[0m/
 data_preprocessing_for_bert.ipynb
 [01;34menglish_database[0m/
 find_context.ipynb
 [01;34mfine_tuned_ckiplab_bert_base_chinese[0m/
 harry_potter_chatbot.ipynb
 HP_1_CN.txt
 HP_1_EN.txt
 HP_2_CN.txt
 HP_2_EN.txt
 HP_3_CN.txt
 HP_3_EN.txt
 HP_4_CN.txt
 HP_4_EN.txt
 HP_5_CN.txt
 HP_5_EN.txt
 HP_6_CN.txt
 HP_6_EN.txt
 HP_7_CN.txt
 HP_7_EN.txt
'HP_chatbot_EN&CN.ipynb'
 [01;34mhp_cn_database[0m/
 HP_CN_QA.csv
 HP_CN_RAG_1.ipynb
 HP_CN_RAG_2.ipynb
 HP_CN_RAG_3.ipynb
 HP_CN_RAG_4.ipynb
 HP_CN_RAG_5.ipynb
 HP_CN_RAG.ipynb
 [01;34mhp_en_database[0m/
 HP_EN_QA.csv
 HP_EN_RAG.ipynb
 [01;34mresults[0m/
 [01;34mtrained_ckiplab_bert_base_chinese[0m/
'Transformers Training and Inference on Remote Hardware.ipynb'


In [3]:
%pip install langchain-community
%pip install langchain_openai
%pip install chromadb

Collecting langchain-community
  Downloading langchain_community-0.2.15-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.3.0,>=0.2.15 (from langchain-community)
  Downloading langchain-0.2.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.37 (from langchain-community)
  Downloading langchain_core-0.2.37-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain-community)
  Downloading langsmith-0.1.108-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain-community)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,

In [4]:
import torch
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load File
import os
from langchain.document_loaders import TextLoader

# Paths of the seven English Harry Potter book files (HP_1_EN.txt .. HP_7_EN.txt)
english_files = [f'/content/drive/MyDrive/Colab Notebooks/Side Projects/HP_chat_bot/HP_{i}_EN.txt' for i in range(1, 8)]

# Load every book that is present. A missing file is reported and skipped
# instead of aborting the whole cell.
documents = []
for file_path in english_files:
    if os.path.exists(file_path):
        loader = TextLoader(file_path)
        documents.extend(loader.load())           # Append whatever the loader returns for this file
    else:
        print(f"File not found: {file_path}")

print(f'Total documents loaded: {len(documents)}')

Total documents loaded: 7


In [6]:
# Split Data Into Chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter settings. Lengths are measured with len(), i.e. in characters:
# chunks of at most 80 characters, with 40 characters of overlap between
# neighbouring chunks. add_start_index=True is passed through unchanged.
splitter_kwargs = {
    "chunk_size": 80,
    "chunk_overlap": 40,
    "length_function": len,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)

# Split the loaded documents into the chunks that will be embedded.
chunks = text_splitter.split_documents(documents)

In [7]:
# Set Up OpenAI API Key
# The key is never written into the notebook: a literal key is saved with the
# file and leaks as soon as the notebook is shared or committed. If the runtime
# already provides OPENAI_API_KEY it is reused; otherwise the user is prompted
# and the input is not echoed.
import os
from getpass import getpass

API_KEY = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')
if not API_KEY:
    raise ValueError('An OpenAI API key is required to build and query the embeddings')
os.environ['OPENAI_API_KEY'] = API_KEY

In [8]:
# Batching helper used when inserting chunks into the vector store.
# NOTE(review): 41666 is presumably sized to a per-call insert limit of the
# vector store -- confirm before changing it.
MAX_BATCH_SIZE = 41666                              # Adjust as needed
def batchify(data, batch_size):
    """Yield consecutive slices of ``data`` with at most ``batch_size`` items each.

    The final slice is shorter when ``len(data)`` is not a multiple of
    ``batch_size``; an empty ``data`` yields nothing.
    """
    total = len(data)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield data[start:stop]

In [9]:
# Data Embedding
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

import os

CHROMA_PATH = 'hp_en_database'                      # Directory where the Chroma database is persisted
EMBEDDING_MODEL = "text-embedding-3-small"          # Queries must later be embedded with this same model

# One embeddings object shared by every insert below.
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

if os.path.isdir(CHROMA_PATH) and os.listdir(CHROMA_PATH):
    # A populated store already exists. Building again would re-embed every
    # chunk (paid API calls) and presumably add the same chunks to the store a
    # second time, so reuse what is on disk instead.
    # If the store was built with a different embedding model or chunking,
    # delete the directory and re-run this cell.
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
    print(f'Found existing database at {CHROMA_PATH}; delete that directory to rebuild it')
else:
    # Create the store from the first batch, then append the remaining chunks
    # in batches of at most MAX_BATCH_SIZE.
    db = Chroma.from_documents(
        documents=chunks[:MAX_BATCH_SIZE],
        embedding=embeddings,
        persist_directory=CHROMA_PATH
    )
    for batch in batchify(chunks[MAX_BATCH_SIZE:], MAX_BATCH_SIZE):
        db.add_documents(batch)

    # No explicit db.persist(): the call is deprecated (it emitted a warning in
    # the recorded run) because stores created with persist_directory are
    # written to disk automatically on chromadb >= 0.4.
    print(f'Saved {len(chunks)} chunks to {CHROMA_PATH}')

Saved 141098 chunks to hp_en_database


  db.persist()


In [10]:
# Prompt template for the answer-generation step.
# It has two named placeholders, {context} and {question}, which are filled in
# when the template is formatted. The wording instructs the model to answer
# from the supplied context only.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [11]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Set up the question-answering system.
# The query has to be embedded with the same model that produced the stored
# vectors ("text-embedding-3-small" in the indexing cell). The library default
# is a different model, and similarity scores between vectors from two
# different models are not meaningful.
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding)

# Get user input for a question
query = input('Question: ')

# Retrieve the 5 most similar chunks together with their relevance scores
results = db.similarity_search_with_relevance_scores(query, k=5)

# Only the first (best) hit is checked against the threshold.
# NOTE(review): the 0.7 cut-off depends on the embedding model and the distance
# metric of the store -- re-validate it against real queries.
if len(results) == 0 or results[0][1] < 0.7:
    print('Unable to find matching results')
else:
    # Join the retrieved chunks into one context string, separated by rules
    context = '\n\n---\n\n'.join([doc.page_content for doc, score in results])

    # Fill the template's {context} and {question} placeholders
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context, question=query)

    # Initialize the ChatOpenAI model
    model = ChatOpenAI()

    # invoke() replaces the deprecated predict(); it returns a message object
    # whose .content is the reply text.
    response = model.invoke(prompt).content

    # Source file of each retrieved chunk (None when the metadata has no "source")
    sources = [doc.metadata.get("source", None) for doc, _score in results]

    # Format and print the response with sources
    formatted_response = f"Response: {response}\nSources: {sources}"
    print(formatted_response)


  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding)


Question: What is the name of Albus Dumbledore's phoenix


  response = model.predict(prompt)


Response: Fawkes
Sources: ['/content/drive/MyDrive/Colab Notebooks/Side Projects/HP_chat_bot/HP_5_EN.txt', '/content/drive/MyDrive/Colab Notebooks/Side Projects/HP_chat_bot/HP_5_EN.txt', '/content/drive/MyDrive/Colab Notebooks/Side Projects/HP_chat_bot/HP_7_EN.txt', '/content/drive/MyDrive/Colab Notebooks/Side Projects/HP_chat_bot/HP_4_EN.txt', '/content/drive/MyDrive/Colab Notebooks/Side Projects/HP_chat_bot/HP_4_EN.txt']
