In [1]:
#!pip install -r requirements.txt

In [2]:
#!pip install -U sentence-transformers

In [3]:
#!pip install -U  langchain_huggingface 

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
from typing import List
from langchain.schema import Document
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma


In [5]:
model_path = 'sentence-transformers/all-MiniLM-L6-v2'
DATA_PATH = 'data/books'
CHROMA_PATH = "chroma"

In [6]:
embeddings = HuggingFaceEmbeddings(model_name=model_path)

In [7]:
class MyEmbeddings:
    def __init__(self):
        self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.model.encode(t).tolist() for t in texts]

In [8]:
def load_documents():
    loader = DirectoryLoader(DATA_PATH, glob="*.md")
    documents = loader.load()
    return documents

In [9]:
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks

In [10]:
documents = load_documents()

In [11]:
chunks = split_text(documents)

Split 1 documents into 801 chunks.
So she was considering in her own mind (as well as she could, for the
hot day made her feel very sleepy and stupid), whether the pleasure of
making a daisy-chain would be worth the trouble of getting up and
picking the daisies, when suddenly a White Rabbit with pink eyes ran
close by her.
{'source': 'data/books/alice_in_wonderland.md', 'start_index': 1654}


In [12]:
db = Chroma.from_documents(chunks, embedding=embeddings,persist_directory="./chroma_db")
retriever = db.as_retriever(
    search_type="mmr",  # Also test "similarity"
    search_kwargs={"k": 20},
    persist_directory="./chroma_db" #директория в которой будут размещаться фаилы
)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [13]:
db.similarity_search(query = 'How did Alice meet the Mad Hatter?')

Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


[Document(metadata={'source': 'data/books/alice_in_wonderland.md', 'start_index': 82827}, page_content='“Really, now you ask me,” said Alice, very much confused, “I don’t\nthink—”\n\n“Then you shouldn’t talk,” said the Hatter.'),
 Document(metadata={'source': 'data/books/alice_in_wonderland.md', 'start_index': 82827}, page_content='“Really, now you ask me,” said Alice, very much confused, “I don’t\nthink—”\n\n“Then you shouldn’t talk,” said the Hatter.'),
 Document(metadata={'source': 'data/books/alice_in_wonderland.md', 'start_index': 70057}, page_content='Alice waited a little, half expecting to see it again, but it did not\nappear, and after a minute or two she walked on in the direction in\nwhich the March Hare was said to live. “I’ve seen hatters before,” she\nsaid to herself; “the March Hare will be much the most interesting, and'),
 Document(metadata={'source': 'data/books/alice_in_wonderland.md', 'start_index': 70057}, page_content='Alice waited a little, half expecting to see 

In [14]:
db3 =  Chroma( persist_directory="./chroma_db",embedding_function=embeddings)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [15]:
db3.similarity_search(query = 'How did Alice meet the Mad Hatter?', )

[Document(metadata={'source': 'data/books/alice_in_wonderland.md', 'start_index': 82827}, page_content='“Really, now you ask me,” said Alice, very much confused, “I don’t\nthink—”\n\n“Then you shouldn’t talk,” said the Hatter.'),
 Document(metadata={'source': 'data/books/alice_in_wonderland.md', 'start_index': 82827}, page_content='“Really, now you ask me,” said Alice, very much confused, “I don’t\nthink—”\n\n“Then you shouldn’t talk,” said the Hatter.'),
 Document(metadata={'source': 'data/books/alice_in_wonderland.md', 'start_index': 70057}, page_content='Alice waited a little, half expecting to see it again, but it did not\nappear, and after a minute or two she walked on in the direction in\nwhich the March Hare was said to live. “I’ve seen hatters before,” she\nsaid to herself; “the March Hare will be much the most interesting, and'),
 Document(metadata={'source': 'data/books/alice_in_wonderland.md', 'start_index': 70057}, page_content='Alice waited a little, half expecting to see 