In [1]:
!pip install "unstructured[md]" chromadb



You should consider upgrading via the 'D:\Code\PromptEngineering_Langchain\env\Scripts\python.exe -m pip install --upgrade pip' command.


In [2]:
from langchain.document_loaders import DirectoryLoader

DATA_PATH = "data/"

def load_documents():
    loader = DirectoryLoader(DATA_PATH, glob="*.md")
    documents = loader.load()
    return documents

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def split_text(documents: list[Document]):
    """Split the text of the documents into chunks.
    Args:
        documents (list[Document]): The documents to split.
    Returns:
        chunks: The chunks of the documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks

In [4]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil

CHROMA_PATH = "chroma"

def save_to_chroma(chunks: list[Document]):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [5]:
def generate_data_store():
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)

generate_data_store()

Split 1 documents into 801 chunks.
So she was considering in her own mind (as well as she could, for the
hot day made her feel very sleepy and stupid), whether the pleasure of
making a daisy-chain would be worth the trouble of getting up and
picking the daisies, when suddenly a White Rabbit with pink eyes ran
close by her.
{'source': 'data\\alice_in_wonderland.md', 'start_index': 1654}
Saved 801 chunks to chroma.


# Embeddings

In [8]:
from langchain.evaluation import load_evaluator

def embedding_evaluation():
    # Get embedding for a word.
    embedding_function = OpenAIEmbeddings()
    vector = embedding_function.embed_query("apple")
    print(f"Vector for 'apple': {vector}")
    print(f"Vector length: {len(vector)}")

    # Compare vector of two words
    evaluator = load_evaluator("pairwise_embedding_distance")
    words = ("apple", "orange")
    x = evaluator.evaluate_string_pairs(prediction=words[0], prediction_b=words[1])
    print(f"Comparing ({words[0]}, {words[1]}): {x}")

embedding_evaluation()



Vector for 'apple': [0.007754413596091041, -0.02315402458734137, -0.007501849152375844, -0.0277684467952731, -0.004570052602434246, 0.012996834081654635, -0.02202089555963392, -0.00840971638961714, 0.018935513203263084, -0.02962513711227445, -0.0029403288619125724, 0.02016420524263257, -0.004402814072396592, 0.009099149362349734, -0.02169324509419347, 0.0020512335957638896, 0.030690003918923712, 0.00010303096742397732, 0.0020119837034118983, -0.025461234759984637, -0.02107889814318613, -0.008163977609214203, 0.021324636923589064, -0.012484878599589383, 0.0011339806058069373, 0.005088833747811759, 0.01014353731641702, 8.900553287787876e-05, 0.015959347047824, -0.012949051644501018, 0.020587422445025448, -0.016109520953919706, -0.018457688831726927, 0.005460854750072293, -0.019290468184597773, -0.009194714236656966, -0.012054836665206842, -0.00872371552843307, -0.005669049588290005, -0.006122983206910652, 0.010512145487021424, 0.007658848721783809, -0.006385787076916837, 0.00072057686603