In [1]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain
import os

In [9]:
# --- Configuration ---------------------------------------------------------
# Input text, the question asked about it, and the context size that is passed
# to the LLM as n_ctx when the model is created.
# NOTE(review): the saved model-load output reports n_ctx = 1024, not 2048, and
# this cell's execution count (9) is later than the model cell's (3) - the value
# was probably edited after the model was loaded. Re-run from a fresh kernel to confirm.
file_name = 'state_of_the_union.txt'
query = "What did the president say about Ketanji Brown Jackson?"
max_num_of_tokens = 2048

# --- Load the raw text -----------------------------------------------------
loader = TextLoader(file_name)
documents = loader.load()

# --- Split into overlapping chunks -----------------------------------------
# Break the loaded document on newlines into overlapping pieces; these chunks
# are what gets embedded into the vector store.
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=500,
    chunk_overlap=100,
)
splited_docs = text_splitter.split_documents(documents)
print('Number of chuncks ', len(splited_docs))


Number of chuncks  95


In [3]:
# Local ggml weights file; the same file backs both generation and embeddings.
model = "./gpt4all-lora-quantized-ggml.bin"

# Text-generation model, with its context size taken from max_num_of_tokens.
llm = LlamaCpp(
    model_path=model,
    n_ctx=max_num_of_tokens,
)

# Embedding function handed to the vector store (no n_ctx is passed here, so
# the library default applies).
llm_embeddings = LlamaCppEmbeddings(model_path=model)

llama.cpp: loading model from ./gpt4all-lora-quantized-ggml.bin
llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this
llama_model_load_internal: format     = ggmf v1 (old version with no mmap support)
llama_model_load_internal: n_vocab    = 32001
llama_model_load_internal: n_ctx      = 1024
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 4113744.11 KB
llama_model_load_internal: mem required  = 5809.33 MB (+ 2052.00 MB per state)
...................................................................................................
.
llama_init_from_file: k

In [5]:
# One on-disk Chroma store per input file: build it when the directory is
# missing, otherwise reopen what is already there.
persist_directory = f'db_{file_name}'
db = None

if not os.path.isdir(persist_directory):
    # No store on disk yet: embed the chunks, create the store and save it.
    db = Chroma.from_documents(
        splited_docs,
        llm_embeddings,
        persist_directory=persist_directory,
    )
    db.persist()
else:
    # A store directory already exists: open it with the same embedding function.
    db = Chroma(
        persist_directory=persist_directory,
        embedding_function=llm_embeddings,
    )


Using embedded DuckDB with persistence: data will be stored in: db_state_of_the_union.txt


In [None]:
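# OpenSearch alternative to Chroma (disabled; expects a server at http://localhost:9200).
# NOTE: `docs` is not defined in the visible cells - pass `splited_docs` if this is re-enabled.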
#docsearch = OpenSearchVectorSearch.from_documents(docs, llm_embeddings, opensearch_url="http://localhost:9200")
#docs = docsearch.similarity_search(query)

In [6]:
# Ask the Chroma store for the chunks most similar to the question.
response_docs = db.similarity_search(query=query)


llama_print_timings:        load time =   650.47 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =  1062.61 ms /    14 tokens (   75.90 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =  1063.99 ms


In [7]:
# Disabled experiment: wrap only the first retrieved chunk in a fresh Document.
#small_docs = [Document(page_content=response_docs[0].page_content, metadata=response_docs[0].metadata)]
# Display the retrieved documents for inspection.
response_docs

[Document(page_content='We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  \n\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.  \n\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \n\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '/private/var/folders/vq/sbkvnrvx5g71hm826hr91cw00000gn/T/26fa01dd4b62b09f4f48b523716bdd846615ba86/state_of_the_union.txt'}),
 Document(page_content='Second – we must prepare for new variants. Over the past year, we’ve gotten much better at detecting new variants. \n\nIf necessary, we’ll be able to deploy new vaccines within 100 days instead of many more months or years.  \n\nAnd, if Congress provides the funds we need, we’ll have new stockpiles of tests, masks, a

In [8]:
# Build a "stuff" question-answering chain on top of the local LLM.
chain = load_qa_chain(llm, chain_type="stuff")

# Ask the question against each retrieved chunk on its own, and keep every
# answer as a Document that carries the metadata of the chunk it came from.
responses = [
    Document(
        page_content=chain.run(input_documents=[rdoc], question=query),
        metadata=rdoc.metadata,
    )
    for rdoc in response_docs
]


llama_print_timings:        load time =   613.01 ms
llama_print_timings:      sample time =    13.13 ms /    18 runs   (    0.73 ms per run)
llama_print_timings: prompt eval time = 12175.51 ms /   184 tokens (   66.17 ms per token)
llama_print_timings:        eval time =  1345.36 ms /    17 runs   (   79.14 ms per run)
llama_print_timings:       total time = 13536.08 ms

llama_print_timings:        load time =   613.01 ms
llama_print_timings:      sample time =    52.77 ms /    74 runs   (    0.71 ms per run)
llama_print_timings: prompt eval time = 12591.76 ms /   192 tokens (   65.58 ms per token)
llama_print_timings:        eval time =  5967.13 ms /    74 runs   (   80.64 ms per run)
llama_print_timings:       total time = 18617.97 ms

llama_print_timings:        load time =   613.01 ms
llama_print_timings:      sample time =    22.93 ms /    32 runs   (    0.72 ms per run)
llama_print_timings: prompt eval time = 13763.54 ms /   208 tokens (   66.17 ms per token)
llama_print_timings

In [11]:
# Question-answering chain used to merge the per-chunk answers into one.
chain = load_qa_chain(llm, chain_type="stuff")

# Size limits are counted in space-separated words, which is only a rough
# stand-in for model tokens.
# NOTE(review): 1024 equals the n_ctx shown in the saved model-load log, whereas
# max_num_of_tokens is set to 2048 elsewhere in the notebook - confirm which
# value this budget is meant to follow.
context_word_budget = 1024
# Fixed allowance added on top of the question and documents; presumably it
# covers the chain's prompt template - TODO confirm.
prompt_overhead_words = 40


def _count_leading_docs_that_fit(docs, question, budget, overhead):
    """Return how many documents at the start of ``docs`` fit in one prompt.

    Sizes are ``len(text.split(' '))`` word counts. Counting stops at the first
    document that would bring ``overhead + question + documents`` up to
    ``budget`` or beyond, so the result always describes a contiguous prefix
    and can be used directly as a slice bound.

    Args:
        docs: sequence of objects exposing a ``page_content`` string.
        question: the question text that shares the prompt with the documents.
        budget: total word budget; the running total must stay below it.
        overhead: fixed number of words reserved before anything is added.

    Returns:
        The number of leading documents that fit (0 when even the first does not).
    """
    used_words = overhead + len(question.split(' '))
    fitting = 0
    for doc in docs:
        used_words += len(doc.page_content.split(' '))
        if used_words >= budget:
            # Stop at the first overflow: the previous loop kept counting later
            # documents, which made responses[:count] select the wrong ones.
            break
        fitting += 1
    return fitting


# Fold the answers together until a single one remains. An empty or
# single-element list is left untouched (no chain call on empty input).
while len(responses) > 1:
    fitting = _count_leading_docs_that_fit(
        responses, query, context_word_budget, prompt_overhead_words
    )
    # Always merge at least two documents so the list shrinks on every pass;
    # with fewer than two the previous loop never made progress. If two
    # documents really do not fit, the chain call may fail with a context-size
    # error, which is preferable to a loop that never terminates.
    batch_size = max(2, fitting)
    merged_answer = chain.run(input_documents=responses[:batch_size], question=query)
    # Drop the merged prefix and queue the combined answer at the end.
    responses = responses[batch_size:] + [Document(page_content=merged_answer)]



llama_print_timings:        load time =   613.01 ms
llama_print_timings:      sample time =    39.68 ms /    56 runs   (    0.71 ms per run)
llama_print_timings: prompt eval time = 27235.14 ms /   405 tokens (   67.25 ms per token)
llama_print_timings:        eval time =  4898.93 ms /    55 runs   (   89.07 ms per run)
llama_print_timings:       total time = 32179.77 ms


In [12]:
# Display what is left after the merge loop: the final answer to `query`.
responses

[Document(page_content=' The president mentioned Ketanji Brown Jackson in his speech regarding the expansion of eligibility for veterans with nine respiratory cancers. He also said that she was an attorney, a judge, and a fierce advocate for veterans’ issues.', metadata={})]