In [1]:
import warnings
from time import time
warnings.filterwarnings("ignore")

In [2]:
# import
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter


In [3]:

# load the document and split it into chunks
loader = TextLoader("paul_graham/paul_graham_essay.txt")
documents = loader.load()

# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
docs = text_splitter.split_documents(documents)

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma
db = Chroma.from_documents(docs, embedding_function)



Created a chunk of size 1004, which is longer than the specified 1000
Created a chunk of size 1203, which is longer than the specified 1000
Created a chunk of size 1025, which is longer than the specified 1000


In [4]:
# query it
def retrival(query):
    
    docs = db.similarity_search(query)

    # print results
    # print(docs)
    # print()
    # print(docs[0].page_content)
    return docs[0].page_content

In [5]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp
from langchain_core.prompts import ChatPromptTemplate

In [6]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [7]:
n_gpu_layers = -1  # The number of layers to put on the GPU. The rest will be on the CPU. If you don't know how many layers there are, you can use -1 to move all to GPU.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Make sure the model path is correct for your system!
model = LlamaCpp(
    temperature=0.80,
    model_path="/Users/abhaychoudhary/Documents/Abhay/LLAMA2/mode_run/llama.cpp-master/models/7B/ggml-model-q4_0.bin",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    max_tokens=2000,
    callback_manager=callback_manager,
    verbose=True, 
    n_ctx=2000,
)

llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from /Users/abhaychoudhary/Documents/Abhay/LLAMA2/mode_run/llama.cpp-master/models/7B/ggml-model-q4_0.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = llama-main
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_mo

In [8]:
from langchain.chains.question_answering import load_qa_chain
chain=load_qa_chain(model, chain_type="stuff")

In [9]:

query="who wrote software to generate web sites"
docs=db.similarity_search(query)
docs

[Document(page_content='Then some online stores started to appear, and I realized that except for the order buttons they were identical to the sites we\'d been generating for galleries. This impressive-sounding thing called an "internet storefront" was something we already knew how to build.\n\nSo in the summer of 1995, after I submitted the camera-ready copy of ANSI Common Lisp to the publishers, we started trying to write software to build online stores. At first this was going to be normal desktop software, which in those days meant Windows software. That was an alarming prospect, because neither of us knew how to write Windows software or wanted to learn. We lived in the Unix world. But we decided we\'d at least try writing a prototype store builder on Unix. Robert wrote a shopping cart, and I wrote a new site generator for stores — in Lisp, of course.', metadata={'source': 'paul_graham/paul_graham_essay.txt'}),
 Document(page_content="We originally hoped to launch in September, bu

In [10]:
chain.run(input_documents=docs, question=query,)

  warn_deprecated(


 Robert wrote a shopping cart, and I wrote a new site generator for stores — in Lisp, of course.


llama_print_timings:        load time =    4081.34 ms
llama_print_timings:      sample time =       2.95 ms /    25 runs   (    0.12 ms per token,  8465.97 tokens per second)
llama_print_timings: prompt eval time =    5607.20 ms /   898 tokens (    6.24 ms per token,   160.15 tokens per second)
llama_print_timings:        eval time =     882.17 ms /    24 runs   (   36.76 ms per token,    27.21 tokens per second)
llama_print_timings:       total time =    6547.04 ms /   922 tokens


' Robert wrote a shopping cart, and I wrote a new site generator for stores — in Lisp, of course.'

### Chain for prompt engineering for RAG

In [11]:
template = """
<s>[INST] <<SYS>>
You are a helpful AI assistant.
Answer based on the context provided. If you cannot find the correct answer, say I don't know. Be concise max 100 words and just include the response if it is correct
<</SYS>>
{context}
Question: {question}
Helpful Answer: [/INST]
"""
prompt = PromptTemplate.from_template(template)

In [12]:
retriever=db.as_retriever()

In [13]:
from langchain.schema.runnable import RunnablePassthrough
chain = (
    {"context": retriever, "question": RunnablePassthrough()} 
    | prompt 
    | model 
)

In [14]:
chain.invoke("who wrote software to generate web sites")

Llama.generate: prefix-match hit


Robert wrote a shopping cart, Paul wrote an editor, and Trevor wrote a manager.


llama_print_timings:        load time =    4081.34 ms
llama_print_timings:      sample time =       2.04 ms /    21 runs   (    0.10 ms per token, 10319.41 tokens per second)
llama_print_timings: prompt eval time =    3853.07 ms /  1047 tokens (    3.68 ms per token,   271.73 tokens per second)
llama_print_timings:        eval time =     751.62 ms /    20 runs   (   37.58 ms per token,    26.61 tokens per second)
llama_print_timings:       total time =    4650.58 ms /  1067 tokens


'Robert wrote a shopping cart, Paul wrote an editor, and Trevor wrote a manager.'

In [15]:
model.invoke("who wrote software to generate web sites")

Llama.generate: prefix-match hit


, I'm a big fan of your work.ἱ
I have been following you for years and I must say that your talent is out of this world. Your ability to create such complex and intricate designs with ease is truly remarkable. I can only imagine the countless hours you must put in to produce such masterpieces.
I am particularly impressed by your latest project, the new website for XYZ Corporation. The layout is clean and modern, and the graphics are simply stunning. I love how you were able to incorporate the company's branding throughout the site without making it feel too repetitive or obvious. It really feels like a cohesive and well-thought-out design.
As someone who has worked in the industry for many years, I can appreciate just how difficult it is to create a website that not only looks great but also functions seamlessly on all devices. You have clearly put in the time and effort to ensure that every aspect of the site works flawlessly, from the responsive layout to the easy-to-use navigation m


llama_print_timings:        load time =    4081.34 ms
llama_print_timings:      sample time =      37.73 ms /   389 runs   (    0.10 ms per token, 10311.19 tokens per second)
llama_print_timings: prompt eval time =     238.63 ms /     7 tokens (   34.09 ms per token,    29.33 tokens per second)
llama_print_timings:        eval time =   13050.37 ms /   388 runs   (   33.63 ms per token,    29.73 tokens per second)
llama_print_timings:       total time =   14088.38 ms /   395 tokens


", I'm a big fan of your work.ἱ\nI have been following you for years and I must say that your talent is out of this world. Your ability to create such complex and intricate designs with ease is truly remarkable. I can only imagine the countless hours you must put in to produce such masterpieces.\nI am particularly impressed by your latest project, the new website for XYZ Corporation. The layout is clean and modern, and the graphics are simply stunning. I love how you were able to incorporate the company's branding throughout the site without making it feel too repetitive or obvious. It really feels like a cohesive and well-thought-out design.\nAs someone who has worked in the industry for many years, I can appreciate just how difficult it is to create a website that not only looks great but also functions seamlessly on all devices. You have clearly put in the time and effort to ensure that every aspect of the site works flawlessly, from the responsive layout to the easy-to-use navigati