In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers
import sys
import datetime

In [2]:

# download model here https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/blob/main/llama-2-7b-chat.ggmlv3.q4_0.bin
loader = TextLoader("llama2_text.txt")
documents=loader.load()

In [19]:
text_splitter=RecursiveCharacterTextSplitter(
                                             chunk_size=500,
                                             chunk_overlap=20)

In [20]:
text_chunks=text_splitter.split_documents(documents)

print(len(text_chunks))

34


In [21]:
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device':'cpu'})


#**Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store***
vector_store=FAISS.from_documents(text_chunks, embeddings)


##**Step 5: Find the Top 3 Answers for the Query***

query="YOLOv7 outperforms which models"
docs = vector_store.similarity_search(query)

In [22]:
vector_store.as_retriever(search_kwargs={'k': 2})

VectorStoreRetriever(tags=None, metadata=None, vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x00000209BDAEF6D0>, search_type='similarity', search_kwargs={'k': 2})

In [23]:
llm=CTransformers(model="llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':500,
                          'temperature':0.1})

In [27]:
template="""Use the following pieces of information to answer the user's question.
If you dont know the answer just say you know, don't try to make up an answer.

Context:{context}
Question:{question}

Only return the helpful answer below and nothing else
Helpful answer
"""

qa_prompt=PromptTemplate(template=template, input_variables=['context', 'question'])

#start=timeit.default_timer()

chain = RetrievalQA.from_chain_type(llm=llm,
                                   chain_type='stuff',
                                   retriever=vector_store.as_retriever(search_kwargs={'k': 5}),
                                   return_source_documents=True,
                                   chain_type_kwargs={'prompt': qa_prompt})

In [28]:
chain

RetrievalQA(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, combine_documents_chain=StuffDocumentsChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, input_key='input_documents', output_key='output_text', llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=PromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, template="Use the following pieces of information to answer the user's question.\nIf you dont know the answer just say you know, don't try to make up an answer.\n\nContext:{context}\nQuestion:{question}\n\nOnly return the helpful answer below and nothing else\nHelpful answer\n", template_format='f-string', validate_template=True), llm=CTransformers(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, client=<ctransformers.llm.LLM object at 0x0

In [None]:
a = datetime.datetime.now()
question="when was llama announced?"
chat_history=[]
result=chain({'query':question,"chat_history": chat_history})
print(result['result'])
b = datetime.datetime.now()
print("time taken -",b-a)

In [None]:
a = datetime.datetime.now()
question="What is length of  Llama 2 - Chat? "
chat_history=[]
result=chain({'query':question,"chat_history": chat_history})
print(result['result'])
b = datetime.datetime.now()
print("time taken -",b-a)