In [None]:
!pip -q install langchain openai tiktoken chromadb lark
!pip -q install sentence_transformers
!pip -q install -U FlagEmbedding

In [None]:
!pip show langchain

In [96]:
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever

## Text Splitting & Docloader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings


In [100]:

bge_embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5", 
encode_kwargs={"normalize_embeddings": True})

In [101]:
loaders =  [
    TextLoader("./data/data.txt")
]
docs = []
for l in loaders:
    docs.extend(l.load())
len(docs)
docs




In [102]:
len(docs)

1

## Retrieving larger chunks
Sometimes, the full documents can be too big to want to retrieve them as is. In that case, what we really want to do is to first split the raw documents into larger chunks, and then split it into smaller chunks. We then index the smaller chunks, but on retrieval we retrieve the larger chunks (but still not the full documents).

In [103]:
# This text splitter is used to create the parent documents - The big chunks
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

# This text splitter is used to create the child documents - The small chunks
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="split_parents", embedding_function=bge_embeddings)

# The storage layer for the parent documents
store = InMemoryStore()

In [104]:
big_chunks_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [105]:
big_chunks_retriever.add_documents(docs)

In [106]:
len(list(store.yield_keys()))

140

In [115]:
sub_docs = vectorstore.similarity_search("what is scytale")

In [116]:
len(sub_docs)

4

In [117]:
print(sub_docs[0].page_content)

Give a trivia or a fun fact about Scytale. How do I pronounce the word Scytale? ,"Scytale is actually the stick for transposition cipher. In Ancient Greek, it means a 'baton' or a 'cylinder'. It is pronounced as ""skit-uh-lee""."


In [118]:
retrieved_docs = big_chunks_retriever.invoke("what is scytale")

In [119]:
len(retrieved_docs)

1

In [120]:
len(retrieved_docs[0].page_content)

1727

In [121]:
print(retrieved_docs[0].page_content)

Log Data is visualized by Grafana (Prometheus and Elasticsearch as data sources)."
Give me some trivia and fun facts about Xmidt.,Xmidt cloud and associated components are named after Greek gods or items of the Greek gods
Tell me a fun fact about Parodus. Is Parodus a Greek word?,"Parodus is actually an ancient greek word for ""entrance""."
Tell me a Fun fact about Talaria. Is it related to any Greek God in any way?,"Talaria were actually the winged sandals, a symbol of the Greek messenger god Hermes. It is pronounced as ""tuh-laa-ree-uh""."
Give a trivia or a fun fact about Scytale. How do I pronounce the word Scytale? ,"Scytale is actually the stick for transposition cipher. In Ancient Greek, it means a 'baton' or a 'cylinder'. It is pronounced as ""skit-uh-lee""."
"How does one pronounce ""Petasos""? Also, tell me something interesting about Petasos.","Petasos in Greek legends was actually the funky hat worn by the Greek messenger god Hermes, and is pronounced as ""pet-uh-sos""."
Wh

In [93]:
print(retrieved_docs[1].page_content)

IndexError: list index out of range

In [122]:
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

qa = RetrievalQA.from_chain_type(llm = Ollama(model="brutus-no-sys"),
                                 chain_type="stuff",
                                 retriever=big_chunks_retriever)

In [125]:
query = "what is talaria do?"
qa.run(query)

'Talaria is a critical service within Xmidt Cloud that enables communication between clients (devices/routers/CPEs) and the backend. Its primary function is to manage millions of device connections, forward device events, and send/receive requests from devices using secure WebSocket connections. Additionally, Talaria utilizes WRP messages for efficient communication within Xmidt Cloud. It acts as a gatekeeper for device access to Xmidt Cloud services through JWT (JSON Web Token) authorization.'