# Q&A with LangChain and ChromaDB

In [1]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceHubEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
import os

In [2]:
# --- Configuration -----------------------------------------------------------
# Keep a real token that is already exported in the environment; fall back to
# the 'token' placeholder only when nothing is set, instead of overwriting it.
os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', 'token')
file_name = 'state_of_the_union.txt'
max_num_of_tokens = 2048  # NOTE(review): not referenced by any visible cell
query = "What did the president say about Ketanji Brown Jackson?"

# --- Load the source text ----------------------------------------------------
loader = TextLoader(file_name)
documents = loader.load()

# --- Split on newlines into overlapping chunks -------------------------------
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100, separator='\n')
splited_docs = text_splitter.split_documents(documents)
print('Number of chuncks ', len(splited_docs))

Number of chuncks  95


In [3]:
# Embedding client for the Hugging Face Hub, created with library defaults.
llm_embeddings = HuggingFaceHubEmbeddings()

# Answering LLM: google/flan-ul2 on the Hub, temperature 0.1, up to 300 new tokens.
flan_ul2 = HuggingFaceHub(
    repo_id="google/flan-ul2",
    model_kwargs={
        "temperature": 0.1,
        "max_new_tokens": 300,
    },
)

  from .autonotebook import tqdm as notebook_tqdm
You're using a different task than the one specified in the repository. Be sure to know what you're doing :)


In [4]:
# The vector store lives in a folder named after the source file.
persist_directory = 'db_' + file_name

if not os.path.isdir(persist_directory):
    # No folder yet: embed every chunk, build the store and write it to disk.
    db = Chroma.from_documents(
        splited_docs,
        llm_embeddings,
        persist_directory=persist_directory,
    )
    db.persist()
else:
    # Folder already present: reopen the stored collection instead of rebuilding.
    db = Chroma(
        persist_directory=persist_directory,
        embedding_function=llm_embeddings,
    )


Using embedded DuckDB with persistence: data will be stored in: db_state_of_the_union.txt


In [None]:
# OpenSearch alternative (left disabled in this notebook)
#docsearch = OpenSearchVectorSearch.from_documents(splited_docs, llm_embeddings, opensearch_url="http://localhost:9200")
#docs = docsearch.similarity_search(query)

In [5]:
# Chroma-backed question answering.
# (Direct lookup, kept for reference: response_docs = db.similarity_search(query))
retriever = db.as_retriever()

# Build the QA chain; the source documents are returned together with the answer.
qa = RetrievalQA.from_chain_type(
    llm=flan_ul2,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [6]:
# Run the chain on the question; the last expression displays the answer text only.
qa_inputs = {"query": query}
result = qa(qa_inputs)
result["result"]

'One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence'

In [9]:
# Number of source documents returned alongside the answer
# (available because the chain was built with return_source_documents=True).
len(result['source_documents'])

4

# Q&A with LangChain and OpenSearch

In [1]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceHubEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
import os

In [2]:
# --- Configuration -----------------------------------------------------------
# Keep a real token that is already exported in the environment; fall back to
# the 'token' placeholder only when nothing is set, instead of overwriting it.
os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', 'token')
file_name = 'docs/state_of_the_union.txt'
max_num_of_tokens = 2048  # NOTE(review): not referenced by any visible cell
query = "What did the president say about Ketanji Brown Jackson?"

# --- Load the source text ----------------------------------------------------
loader = TextLoader(file_name)
documents = loader.load()

# --- Split on newlines into overlapping chunks -------------------------------
# The recorded output shows a few chunks exceeding the requested 300.
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=50, separator='\n')
splited_docs = text_splitter.split_documents(documents)


Created a chunk of size 305, which is longer than the specified 300
Created a chunk of size 333, which is longer than the specified 300
Created a chunk of size 325, which is longer than the specified 300


In [3]:
# NOTE(review): these equal the chunk_size / chunk_overlap literals passed to
# CharacterTextSplitter in the previous cell, but no visible cell reads them.
CHUNK_SIZE, CHUNK_OVERLAP = 300, 50

# Embedding client for the Hugging Face Hub, created with library defaults.
embeddings = HuggingFaceHubEmbeddings()

# Answering LLM: OpenAssistant Pythia-12B on the Hub, temperature 0.01,
# up to 300 new tokens.
llm = HuggingFaceHub(
    repo_id="OpenAssistant/oasst-sft-1-pythia-12b",
    model_kwargs={"temperature": 0.01, "max_new_tokens": 300},
)


  from .autonotebook import tqdm as notebook_tqdm
You're using a different task than the one specified in the repository. Be sure to know what you're doing :)


In [10]:
# Connection string for the OpenSearch node. Host and credentials should come
# from the OPENSEARCH_URL environment variable; the original literal remains
# only as a fallback so the notebook behaves as before when it is not set.
OPENSEARCH_URL = os.environ.get('OPENSEARCH_URL', 'https://admin:admin@192.168.0.26:9200')

# Embed the chunks and index them under 'state_of_the_union'.
# NOTE(review): verify_certs=False disables TLS certificate verification —
# acceptable for a local test node only.
# NOTE(review): presumably every re-run indexes the same chunks again — confirm
# before re-executing against an already populated index.
OpenSearchVectorSearch.from_documents(
    splited_docs,
    embeddings,
    opensearch_url=OPENSEARCH_URL,
    index_name='state_of_the_union',
    verify_certs=False,
)



<langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x15ad74550>

In [14]:
# Open a handle on the 'state_of_the_union' index using the same embeddings
# and connection settings that were used to populate it.
db = OpenSearchVectorSearch(
    opensearch_url=OPENSEARCH_URL,
    index_name='state_of_the_union',
    embedding_function=embeddings,
    verify_certs=False,
)

In [15]:
# Expose the OpenSearch index as a retriever for the QA chain.
retriever = db.as_retriever()

# Build the QA chain; the source documents are returned together with the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [16]:
# Reuse the question defined in the configuration cell rather than repeating the
# literal, so the question lives in exactly one place.
result = qa({"query": query})



In [17]:
# Display only the generated answer text.
# NOTE(review): the answer recorded below names President Zelenskyy and a Circuit
# Court of Appeals, unlike the answer recorded for the same question in the
# ChromaDB notebook — looks like the model is not sticking to the retrieved
# context; inspect result['source_documents'] before trusting it.
result['result']

' President Zelenskyy mentioned Ketanji Brown Jackson during his press conference on July 25th. He praised her for her legal expertise and experience, and said that she would make a great judge on the Circuit Court of Appeals.'