In [1]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import (
    StreamingStdOutCallbackHandler
)
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
template = """Question: {question}

Answer:"""

prompt = PromptTemplate(template=template, input_variables=["question"])
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [3]:
llm = LlamaCpp(
    model_path="llama-2-7b-chat.ggmlv3.q5_0.bin",
    n_ctx=6000,
    n_gpu_layers=512,
    n_batch=30,
    callback_manager=callback_manager,
    temperature = 0.9,
    max_tokens = 4095,
    n_parts=1,
    
)

llm_chain = LLMChain(prompt=prompt, llm=llm)

llama.cpp: loading model from llama-2-7b-chat.ggmlv3.q5_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 6000
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 5.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 8 (mostly Q5_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 4418.98 MB (+ 3000.00 MB per state)
llama_new_context_with_model: kv self size  = 3000.00 MB
llama_new_con

In [7]:
loader = UnstructuredFileLoader("test.pdf")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
docs = text_splitter.split_documents(documents)

# embedding engine
hf_embedding = HuggingFaceInstructEmbeddings()

Downloading (…)c7233/.gitattributes: 100%|██████████| 1.48k/1.48k [00:00<00:00, 303kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 270/270 [00:00<00:00, 54.5kB/s]
Downloading (…)/2_Dense/config.json: 100%|██████████| 116/116 [00:00<00:00, 72.0kB/s]
Downloading pytorch_model.bin: 100%|██████████| 3.15M/3.15M [00:00<00:00, 4.79MB/s]
Downloading (…)9fb15c7233/README.md: 100%|██████████| 66.3k/66.3k [00:00<00:00, 376kB/s]
Downloading (…)b15c7233/config.json: 100%|██████████| 1.53k/1.53k [00:00<00:00, 907kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 66.3kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.34G/1.34G [03:40<00:00, 6.07MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 23.1kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 2.20k/2.20k [00:00<00:00, 888kB/s]
Downloading spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 8.61MB/s]
Downloading (…)c7233/tokenizer.json: 100%|██████████|

load INSTRUCTOR_Transformer
max_seq_length  512


In [9]:
db = FAISS.from_documents(docs, hf_embedding)

# save embeddings in local directory
db.save_local("faiss_AiArticle")

# load from local
db = FAISS.load_local("faiss_AiArticle/", embeddings=hf_embedding)

In [10]:
query = "what is the main idea of this paper?"
search = db.similarity_search(query, k=2)

In [11]:
template = '''Context: {context}

Based on Context provide me answer for following question
Question: {question}

Tell me the information about the fact. The answer should be from context only
do not use general knowledge to answer the query'''

prompt = PromptTemplate(input_variables=["context", "question"], template= template)
final_prompt = prompt.format(question=query, context=search)

In [12]:
llm_chain.run(final_prompt)

 Based on the provided context, the main idea of this paper appears to be the development and evaluation of a new approach for improving the performance of real-time applications in computer systems. Specifically, the authors propose and evaluate a heuristic called ECON that enhances the capacity of the system to improve its performance in real-time applications. The paper presents a detailed description of the proposed approach, as well as numerical results to compare the performance of ECON with the baseline approach.


llama_print_timings:        load time = 38329.90 ms
llama_print_timings:      sample time =    86.47 ms /    99 runs   (    0.87 ms per token,  1144.85 tokens per second)
llama_print_timings: prompt eval time = 253308.98 ms /   245 tokens ( 1033.91 ms per token,     0.97 tokens per second)
llama_print_timings:        eval time = 102015.26 ms /    98 runs   ( 1040.97 ms per token,     0.96 tokens per second)
llama_print_timings:       total time = 355708.74 ms


' Based on the provided context, the main idea of this paper appears to be the development and evaluation of a new approach for improving the performance of real-time applications in computer systems. Specifically, the authors propose and evaluate a heuristic called ECON that enhances the capacity of the system to improve its performance in real-time applications. The paper presents a detailed description of the proposed approach, as well as numerical results to compare the performance of ECON with the baseline approach.'