In [1]:
import os
from getpass import getpass

from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import HuggingFaceHubEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA

In [2]:
# Configuration, document loading and chunking.
#
# The Hugging Face Hub token is never written into the notebook: a value that
# is already exported in the environment is reused untouched, and only when it
# is missing (or empty) is the user prompted for it without echoing the input.
if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = getpass('Hugging Face Hub API token: ')

file_name = 'state_of_the_union.txt'
max_num_of_tokens = 2048  # NOTE(review): not referenced by any cell visible here -- confirm before removing
query = "What did the president say about Ketanji Brown Jackson?"

# Read the source text file into LangChain documents.
loader = TextLoader(file_name)
documents = loader.load()

# Split on newlines with chunk_size=500 and chunk_overlap=100 so neighbouring
# chunks share some context; these chunks are what gets embedded and indexed.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100, separator='\n')
splited_docs = text_splitter.split_documents(documents)
print('Number of chuncks ', len(splited_docs))

Number of chuncks  95


In [3]:
# Hosted Hugging Face Hub clients: one for embeddings, one for text generation.
# HuggingFaceHubEmbeddings() is created with its library defaults.
llm_embeddings = HuggingFaceHubEmbeddings()

# flan-ul2 is the model that answers the question; the sampling settings are
# passed through to the Hub via model_kwargs.
flan_ul2 = HuggingFaceHub(
    repo_id="google/flan-ul2",
    model_kwargs=dict(temperature=0.1, max_new_tokens=300),
)

  from .autonotebook import tqdm as notebook_tqdm
You're using a different task than the one specified in the repository. Be sure to know what you're doing :)


In [4]:
# Load the Chroma index from disk when it exists, otherwise build it once from
# the chunks and persist it.
# NOTE: an existing directory is reused as-is (only its presence is checked),
# so delete it after changing the input file, the chunking or the embeddings.
persist_directory = f'db_{file_name}'

if not os.path.isdir(persist_directory):
    # First run: embed every chunk and write the index to persist_directory.
    db = Chroma.from_documents(splited_docs, llm_embeddings, persist_directory=persist_directory)
    db.persist()
else:
    # Later runs: reopen the persisted index with the same embedding function.
    db = Chroma(persist_directory=persist_directory, embedding_function=llm_embeddings)


Using embedded DuckDB with persistence: data will be stored in: db_state_of_the_union.txt


In [None]:
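# OpenSearch alternative to Chroma (disabled; expects an OpenSearch server at http://localhost:9200).
# NOTE: `docs` is not defined in the cells shown here -- presumably `splited_docs` is meant; confirm before re-enabling.
#docsearch = OpenSearchVectorSearch.from_documents(docs, llm_embeddings, opensearch_url="http://localhost:9200")
#docs = docsearch.similarity_search(query)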

In [5]:
# Retrieval-augmented QA on top of the Chroma index.
# (A plain similarity lookup would be: db.similarity_search(query))
retriever = db.as_retriever()

# "stuff" chain type with flan-ul2 as the answering model;
# return_source_documents=True keeps the retrieved chunks in the result
# so they can be inspected afterwards.
qa = RetrievalQA.from_chain_type(
    llm=flan_ul2,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [6]:
# Run the question through the chain and display the generated answer.
result = qa(dict(query=query))
result["result"]

'One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence'

In [9]:
# Number of retrieved chunks that were returned alongside the answer.
len(result["source_documents"])

4