In [13]:
# General import
import os
import shutil

# 3rd-party import
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Local import

In [14]:
DATA_PATH = "../data"

def load_doc() -> list[Document]:
    """Load the text files in ``DATA_PATH`` as LangChain documents.

    Returns:
        The documents produced by a ``DirectoryLoader`` configured with the
        ``*.txt`` glob pattern.
    """
    return DirectoryLoader(DATA_PATH, glob="*.txt").load()

In [15]:
def split_txt(
    doc: list[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 500,
) -> list[Document]:
    """Split documents into overlapping, character-based chunks.

    Args:
        doc: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between consecutive chunks.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``. Because
        ``add_start_index=True`` is set, each chunk's metadata carries a
        ``start_index`` entry.
    """
    txt_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )

    chunks = txt_splitter.split_documents(doc)
    print(f"Splitted {len(doc)} documents into {len(chunks)} chunks.")

    return chunks

In [16]:
CHROMA_PATH = "chroma"

def save_to_chroma(chunks: list[Document]) -> None:
    """Rebuild the Chroma database at ``CHROMA_PATH`` from ``chunks``.

    An existing database directory is removed before the new one is written,
    so the store always reflects exactly the chunks passed in. When ``chunks``
    is empty nothing is deleted or written.

    Args:
        chunks: Document chunks to embed and persist.
    """
    # Bail out before touching the disk: deleting the current database when
    # there is nothing to replace it with would leave no usable store behind.
    if not chunks:
        print(f"No chunks to save; leaving {CHROMA_PATH} untouched.")
        return

    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)  # clear the db

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_PATH)
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [17]:
def generate_data_score() -> None:
    """Run the ingestion pipeline: load documents, split them, save to Chroma."""
    save_to_chroma(split_txt(load_doc()))

In [18]:
# Run the full pipeline: load the text files, split them into chunks, and save them to Chroma.
generate_data_score()

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


Splitted 2 documents into 19 chunks.
Saved 19 chunks to chroma.


In [None]:
def check_database_content(sample_size: int = 3) -> None:
    """
    Check what documents are stored in the Chroma database.

    Prints the total number of stored documents, followed by up to
    ``sample_size`` sample documents (first 200 characters of each) together
    with their metadata.

    Args:
        sample_size: Maximum number of sample documents to print; expected to
            be a positive integer.
    """
    embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Get collection info
    collection = db._collection
    print(f"Database has {collection.count()} documents")

    # Fetch only the rows that will be printed rather than the whole collection.
    results = collection.get(limit=sample_size, include=["documents", "metadatas"])
    if results and results['documents']:
        # Tolerate a missing metadata list so the documents are still shown.
        metadatas = results['metadatas'] or []
        print("\nSample documents in database:")
        for i, doc in enumerate(results['documents'][:sample_size]):
            print(f"Document {i+1}: {doc[:200]}...")
            if i < len(metadatas):
                print(f"Metadata: {metadatas[i]}")
            print("-" * 50)
    else:
        print("No documents found in the database!")


In [20]:

# Print the stored document count and a few sample documents from the Chroma database.
check_database_content()

Database has 19 documents

Sample documents in database:
Document 1: Room & Accommodation 1. Q: What types of rooms available? A: Premium 1-Bedroom Premium 2-Bedroom Premium 3-Bedroom Presidential Suites Signature 2-Bedroom Suites Signature 2-Bedroom Signature 1 Bedroo...
Metadata: {'source': '..\\data\\FAQ.txt', 'start_index': 0}
--------------------------------------------------
Document 2: 5. Q: Are Wi-Fi provide in room? A: Yes, under complimentary basis

6. Q: Are toiletries provided? A: Yes, basic toiletries such as body wash, shampoo, and toilet paper are provided.

7. Q: Do you pro...
Metadata: {'source': '..\\data\\FAQ.txt', 'start_index': 909}
--------------------------------------------------
Document 3: 8. Q: Do you have a room with balcony or any view? A: Selected units offer stunning city or KL Eco City views. Please check with us during reservation for availability.

9. Q: Do you provide extra bed...
Metadata: {'start_index': 1269, 'source': '..\\data\\FAQ.txt'}
--------