In [1]:
import warnings
from time import time
warnings.filterwarnings("ignore")

In [2]:
# import
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter


In [3]:
# # Use this to load the pdf document
# from langchain_community.document_loaders import PyPDFLoader
# loader = PyPDFLoader("abc.pdf")
# documents = loader.load_and_split()

In [4]:
# # load the document and split it into chunks
loader = TextLoader("../paul_graham/paul_graham_essay.txt")
print(type(loader))
documents = loader.load()

<class 'langchain_community.document_loaders.text.TextLoader'>


In [5]:
# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
docs = text_splitter.split_documents(documents)

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma
db = Chroma.from_documents(docs, embedding_function)

retriever=db.as_retriever()

Created a chunk of size 1004, which is longer than the specified 1000
Created a chunk of size 1203, which is longer than the specified 1000
Created a chunk of size 1025, which is longer than the specified 1000


In [6]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp
from langchain_core.prompts import ChatPromptTemplate

In [7]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [8]:
pwd

'/Users/abhaychoudhary/Documents/Abhay/LLAMA2/LLAMA2-RAG/RAG_langchain'

In [17]:
n_gpu_layers = -1  # The number of layers to put on the GPU. The rest will be on the CPU. If you don't know how many layers there are, you can use -1 to move all to GPU.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Make sure the model path is correct for your system!
model = LlamaCpp(
    temperature=0.80,
    model_path="../model/ggml-model-q4_0.bin",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    max_tokens=2000,
    callback_manager=callback_manager,
    verbose=True, 
    n_ctx=4000,
)

llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from ../model/ggml-model-q4_0.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = llama-main
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32    

In [18]:
# semantic search on cromadb
query="who wrote software to generate web sites"
docs=db.similarity_search(query)
docs

[Document(page_content='Then some online stores started to appear, and I realized that except for the order buttons they were identical to the sites we\'d been generating for galleries. This impressive-sounding thing called an "internet storefront" was something we already knew how to build.\n\nSo in the summer of 1995, after I submitted the camera-ready copy of ANSI Common Lisp to the publishers, we started trying to write software to build online stores. At first this was going to be normal desktop software, which in those days meant Windows software. That was an alarming prospect, because neither of us knew how to write Windows software or wanted to learn. We lived in the Unix world. But we decided we\'d at least try writing a prototype store builder on Unix. Robert wrote a shopping cart, and I wrote a new site generator for stores — in Lisp, of course.', metadata={'source': '../paul_graham/paul_graham_essay.txt'}),
 Document(page_content="We originally hoped to launch in September,

In [19]:
#Invoking model without context
model.invoke("who wrote software to generate web sites")

 using a markup language.его name is Tim Berners-Lee, and he developed the first web browser in 1990. He also created the HTML and URL protocols, which are still used today to build and access websites.




llama_print_timings:        load time =    1768.94 ms
llama_print_timings:      sample time =       5.88 ms /    53 runs   (    0.11 ms per token,  9019.74 tokens per second)
llama_print_timings: prompt eval time =    1768.88 ms /     8 tokens (  221.11 ms per token,     4.52 tokens per second)
llama_print_timings:        eval time =    1716.69 ms /    52 runs   (   33.01 ms per token,    30.29 tokens per second)
llama_print_timings:       total time =    3595.08 ms /    60 tokens


' using a markup language.его name is Tim Berners-Lee, and he developed the first web browser in 1990. He also created the HTML and URL protocols, which are still used today to build and access websites.\n\n'

### Chain for prompt engineering for RAG

In [20]:
from langchain.chains.question_answering import load_qa_chain
chain=load_qa_chain(model, chain_type="stuff")

ggml_metal_free: deallocating


In [21]:
#creating prompt template for model input
template = """
<s>[INST] <<SYS>>
You are a helpful AI assistant.
Answer based on the context provided. If you cannot find the correct answer, say I don't know. Be concise within 50 words.
<</SYS>>
{context}
Question: {question}
Helpful Answer: [/INST]
"""
prompt = PromptTemplate.from_template(template)

In [22]:
from langchain.schema.runnable import RunnablePassthrough
chain = (
    {"context": retriever, "question": RunnablePassthrough()} 
    | prompt 
    | model 
)

In [23]:
# calling llama2 model with cromadb similarity_search for better response
# chain.invoke("who wrote software to generate web sites")

In [None]:
while True:
    user_input = input("Enter something (or type 'quit' to exit): ")
    if user_input=="quit":
        break
    else:
        result=chain.invoke(user_input)
        print(result)
        print("==========================================================================")
        

Enter something (or type 'quit' to exit):  who wrote software to generate web sites


Llama.generate: prefix-match hit


Paul Graham wrote software to generate web sites. Robert wrote a shopping cart, and Trevor wrote a store manager.


llama_print_timings:        load time =    1768.94 ms
llama_print_timings:      sample time =       2.36 ms /    25 runs   (    0.09 ms per token, 10588.73 tokens per second)
llama_print_timings: prompt eval time =    4033.12 ms /  1042 tokens (    3.87 ms per token,   258.36 tokens per second)
llama_print_timings:        eval time =     902.27 ms /    24 runs   (   37.59 ms per token,    26.60 tokens per second)
llama_print_timings:       total time =    4990.73 ms /  1066 tokens


Paul Graham wrote software to generate web sites. Robert wrote a shopping cart, and Trevor wrote a store manager.
