In [2]:


from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Sentence-transformers checkpoint used to embed both documents and queries.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Run on the currently selected CUDA device when one is available, else on CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device is handed to the model constructor and to every encode call;
# documents are encoded 32 at a time.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)
     


In [11]:


# Two throw-away sentences, embedded only to check the model works and to read
# off the embedding dimensionality.
docs = ["this is one document", "and another document"]

# One vector per input document.
embeddings = embed_model.embed_documents(docs)

print(
    f"We have {len(embeddings)} doc embeddings, each with "
    f"a dimensionality of {len(embeddings[0])}."
)
     


We have 2 doc embeddings, each with a dimensionality of 384.


In [10]:
! pip install -q pinecone-client==2.2.2 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[33mDEPRECATION: celery 5.0.5 has a non-standard dependency specifier pytz>dev. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of celery or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [15]:

import getpass
import os

import pinecone

# get API key from app.pinecone.io and environment from console
# Credentials are read from the environment, falling back to an interactive
# prompt, so that no secret is ever stored in the notebook source.
# NOTE(review): API keys were previously hard-coded in this cell; treat them as
# leaked and revoke them in the Pinecone console.
PINECONE_API_KEY = (
    os.environ.get('PINECONE_API_KEY')
    or getpass.getpass('Pinecone API key: ')
)
# The environment name is not a secret, so a default is kept.
PINECONE_ENV = os.environ.get('PINECONE_ENVIRONMENT') or 'gcp-starter'

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)
     

Now we initialize the index.

In [16]:

import time

# Imported here because the "index already exists" branch below needs it; the
# name is otherwise only imported in a later cell.
from langchain.vectorstores import Pinecone

index_name = 'llama-2-rag_2'

if index_name not in pinecone.list_indexes():
    # The index dimension is taken from the sample embeddings computed above,
    # so it always matches the embedding model's output size.
    pinecone.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine'
    )
    # wait for index to finish initialization
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)
else:
    # Index already exists: wrap it as a LangChain vector store. The second
    # argument must be the embedding model itself, not the list of sample
    # vectors held in `embeddings`.
    docsearch = Pinecone.from_existing_index(index_name, embed_model)

Now we connect to the index:

In [18]:
# Open a client handle to the index named by `index_name`.
index = pinecone.Index(index_name)
# Last expression of the cell: shows the index statistics as the cell output.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [19]:
from datasets import load_dataset

# Fetch the 'train' split of the pre-chunked arXiv papers dataset.
data = load_dataset('jamescalam/llama-2-arxiv-papers-chunked', split='train')

# Show the dataset summary as the cell output.
data

Downloading readme:   0%|          | 0.00/409 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 4838
})

In [20]:
# Convert the loaded dataset to a DataFrame exactly once. After the first run
# `data` is already a DataFrame (which has no `to_pandas`), so the guard keeps
# this cell safe to re-run.
if hasattr(data, 'to_pandas'):
    data = data.to_pandas()

# Number of chunks embedded and upserted per request.
batch_size = 32

for start in range(0, len(data), batch_size):
    # .iloc clips the slice at the end of the frame, so the final (shorter)
    # batch needs no special handling.
    batch = data.iloc[start:start + batch_size]
    # Materialise only the needed columns, once per batch, as plain dicts
    # instead of walking the frame with iterrows() three times.
    records = batch[
        ['doi', 'chunk-id', 'chunk', 'source', 'title']
    ].to_dict('records')

    # Vector id is "<doi>-<chunk-id>".
    ids = [f"{row['doi']}-{row['chunk-id']}" for row in records]
    texts = [row['chunk'] for row in records]
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'text': row['chunk'],
         'source': row['source'],
         'title': row['title']} for row in records
    ]
    # add to Pinecone (a concrete list rather than a one-shot zip iterator)
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

In [21]:
# Show the index statistics again, after the upsert loop above.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.04576,
 'namespaces': {'': {'vector_count': 4576}},
 'total_vector_count': 4576}

In [22]:
# Render the DataFrame produced by `data.to_pandas()` as the cell output.
display(data)

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1102.0183,0,High-Performance Neural Networks\nfor Visual O...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
1,1102.0183,1,"January 2011\nAbstract\nWe present a fast, ful...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
2,1102.0183,2,promising architectures for such tasks. The mo...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
3,1102.0183,3,"Mutch and Lowe, 2008), whose lters are xed, ...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
4,1102.0183,4,We evaluate various networks on the handwritte...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4833,2307.09288,315,"BytheCentralLimitTheorem, Zntendstowardsastand...",2307.09288,Llama 2: Open Foundation and Fine-Tuned Chat M...,"In this work, we develop and release Llama 2, ...",http://arxiv.org/pdf/2307.09288,"[Hugo Touvron, Louis Martin, Kevin Stone, Pete...","[cs.CL, cs.AI]",,,cs.CL,20230718,20230719,"[{'id': '2305.13245', 'title': 'GQA: Training ..."
4834,2307.09288,316,Table 52 presents a model card (Mitchell et al...,2307.09288,Llama 2: Open Foundation and Fine-Tuned Chat M...,"In this work, we develop and release Llama 2, ...",http://arxiv.org/pdf/2307.09288,"[Hugo Touvron, Louis Martin, Kevin Stone, Pete...","[cs.CL, cs.AI]",,,cs.CL,20230718,20230719,"[{'id': '2305.13245', 'title': 'GQA: Training ..."
4835,2307.09288,317,models will be released as we improve model sa...,2307.09288,Llama 2: Open Foundation and Fine-Tuned Chat M...,"In this work, we develop and release Llama 2, ...",http://arxiv.org/pdf/2307.09288,"[Hugo Touvron, Louis Martin, Kevin Stone, Pete...","[cs.CL, cs.AI]",,,cs.CL,20230718,20230719,"[{'id': '2305.13245', 'title': 'GQA: Training ..."
4836,2307.09288,318,Training Factors We usedcustomtraininglibrarie...,2307.09288,Llama 2: Open Foundation and Fine-Tuned Chat M...,"In this work, we develop and release Llama 2, ...",http://arxiv.org/pdf/2307.09288,"[Hugo Touvron, Louis Martin, Kevin Stone, Pete...","[cs.CL, cs.AI]",,,cs.CL,20230718,20230719,"[{'id': '2305.13245', 'title': 'GQA: Training ..."


In [24]:
from llama_cpp import Llama

# Path to the local GGUF model file.
model_path = "./llama_models/llama2-chat-ayb-13b.Q5_K_M.gguf"
# NOTE(review): `llm` is rebound further down by `llm = LlamaLLM(...)`, which
# loads the same file a second time. This instance looks unused; confirm and
# drop it to avoid loading the model twice.
llm = Llama(model_path=model_path, n_ctx=4096)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3070 Laptop GPU, compute capability 8.6
llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ./llama_models/llama2-chat-ayb-13b.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama

In [27]:
from langchain.vectorstores import Pinecone

# Metadata key under which each chunk's raw text was stored at upsert time.
text_field = 'text'  # field in metadata that contains text content

# LangChain vector store over the Pinecone index; queries are embedded with
# the same model that produced the stored vectors.
vectorstore = Pinecone(index, embed_model.embed_query, text_field)



In [28]:
query = 'what makes llama 2 special?'

# Retrieve the 3 chunks most similar to the query and show them as the cell
# output.
vectorstore.similarity_search(query, k=3)

[Document(page_content='Ricardo Lopez-Barquilla, Marc Shedroﬀ, Kelly Michelena, Allie Feinstein, Amit Sangani, Geeta\nChauhan,ChesterHu,CharltonGholson,AnjaKomlenovic,EissaJamil,BrandonSpence,Azadeh\nYazdan, Elisa Garcia Anzano, and Natascha Parks.\n•ChrisMarra,ChayaNayak,JacquelinePan,GeorgeOrlin,EdwardDowling,EstebanArcaute,Philomena Lobo, Eleonora Presani, and Logan Kerr, who provided helpful product and technical organization support.\n46\n•Armand Joulin, Edouard Grave, Guillaume Lample, and Timothee Lacroix, members of the original\nLlama team who helped get this work started.\n•Drew Hamlin, Chantal Mora, and Aran Mun, who gave us some design input on the ﬁgures in the\npaper.\n•Vijai Mohan for the discussions about RLHF that inspired our Figure 20, and his contribution to the\ninternal demo.\n•Earlyreviewersofthispaper,whohelpedusimproveitsquality,includingMikeLewis,JoellePineau,\nLaurens van der Maaten, Jason Weston, and Omer Levy.', metadata={'source': 'http://arxiv.org/pdf/230

In [32]:
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
class LlamaLLM(LLM):
    """LangChain ``LLM`` wrapper around a local llama-cpp-python ``Llama`` model.

    The model file is loaded once in ``__init__`` and kept on the instance, so
    the wrapper can be passed wherever LangChain expects an LLM.
    """

    # Path of the model file that was loaded.
    model_path: str
    # The underlying llama-cpp-python model instance.
    llm: Llama

    @property
    def _llm_type(self) -> str:
        """Identifier string for this LLM implementation."""
        return "llama-cpp-python"

    def __init__(self, model_path: str, **kwargs: Any):
        """Load the model at ``model_path``.

        Args:
            model_path: Path to the model file handed to ``Llama``.
            **kwargs: Forwarded to the ``Llama`` constructor (e.g. ``n_ctx``).
                The same keywords are also passed on to the base-class
                initialiser, as before.
        """
        llm = Llama(model_path=model_path, **kwargs)
        super().__init__(model_path=model_path, llm=llm, **kwargs)

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text to complete.
            stop: Optional stop sequences; ``None`` is treated as no stops.
            run_manager: Accepted for compatibility with LangChain's ``_call``
                contract; not used here.
            **kwargs: Extra per-call generation options, forwarded unchanged
                to the underlying model call.

        Returns:
            The text of the first completion choice.
        """
        response = self.llm(prompt, stop=stop or [], **kwargs)
        return response["choices"][0]["text"]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Parameters that identify this model instance."""
        return {"model_path": self.model_path}

# Rebind `llm` to the LangChain-compatible wrapper; `n_ctx` reaches the
# underlying `Llama` constructor through **kwargs.
llm = LlamaLLM(model_path=model_path,n_ctx=4096)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ./llama_models/llama2-chat-ayb-13b.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q5_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q5_K     [  5120,  5120,     

In [33]:
from langchain.chains import RetrievalQA

# Retrieval-augmented QA chain built with the 'stuff' chain type, using the
# Pinecone vector store as retriever and the local model as LLM.
rag_pipeline = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type='stuff',
    llm=llm,
)

In [34]:
# Baseline: ask the model directly, without going through the retrieval chain.
llm('what is so special about llama 2?')


llama_print_timings:        load time =    3546.56 ms
llama_print_timings:      sample time =      44.64 ms /   128 runs   (    0.35 ms per token,  2867.32 tokens per second)
llama_print_timings: prompt eval time =    3545.56 ms /    11 tokens (  322.32 ms per token,     3.10 tokens per second)
llama_print_timings:        eval time =  434110.19 ms /   127 runs   ( 3418.19 ms per token,     0.29 tokens per second)
llama_print_timings:       total time =  438359.30 ms


'\nWhy did it took so long for LLAMA 2 to be deployed?\nA little background: The Lower Lasalle Avenue Assessment Method (LLAMA) is a web-based tool developed by the U.S. Department of Energy (DOE) to estimate whole-building energy consumption and identify cost-effective energy efficiency measures. LLAMA 1 was released in August 2009, but it did not incorporate energy savings from onsite renewable energy generation or include energy use for plug loads.\nLLAMA 2, which addresses these shortcomings and includes several'

In [35]:
# Same question through the retrieval chain, for comparison with the direct call.
rag_pipeline('what is so special about llama 2?')

Llama.generate: prefix-match hit

llama_print_timings:        load time =    3546.56 ms
llama_print_timings:      sample time =      48.59 ms /   128 runs   (    0.38 ms per token,  2634.23 tokens per second)
llama_print_timings: prompt eval time =   29660.79 ms /  1325 tokens (   22.39 ms per token,    44.67 tokens per second)
llama_print_timings:        eval time =  444290.26 ms /   127 runs   ( 3498.35 ms per token,     0.29 tokens per second)
llama_print_timings:       total time =  474566.29 ms


{'query': 'what is so special about llama 2?',
 'result': ' LLAMA 2 is a collection of pretrained and fine-tuned large language models (LLMs) ranging from 7 billion to 70 billion parameters. What sets it apart is that these fine-tuned models, called L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc are optimized for dialogue use cases and outperform open-source chat models on most benchmarks tested. Additionally, based on human evaluations for helpfulness and safety, they may serve'}