# Llama 2 Retrieval-Augmented Generation

In [1]:
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

In [2]:
# Use the GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [3]:
# Shell out to the NVIDIA CLI to show the GPU, driver/CUDA versions and current memory use.
!nvidia-smi

Fri Sep 29 23:15:48 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090         On | 00000000:17:00.0 Off |                  N/A |
|  0%   49C    P8               21W / 350W|   1140MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# Sentence-transformers checkpoint used for the document embeddings.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Load the model on `device` and encode on that same device with a batch size of 32.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# A few throwaway sentences used to sanity-check the embedding model.
docs = [
    "this is one document",
    "and another document",
    "My name is Anshuman",
    "Kill him a and all that it takes",
    "why is he like that?",
]

embeddings = embed_model.embed_documents(docs)

# Report how many vectors came back and the length of the first one.
embedding_summary = (
    f"We have {len(embeddings)} doc embeddings, each with "
    f"a dimensionality of {len(embeddings[0])}."
)
print(embedding_summary)

We have 5 doc embeddings, each with a dimensionality of 384.


---

## Upsert to a Vector Database - Pinecone

In [6]:
import os
import pinecone
from tqdm import tqdm

# Credentials are read from the environment: the API key comes from
# app.pinecone.io and the environment name from the Pinecone console.
# The literal strings are placeholders that are used when a variable is unset.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY') or 'PINECONE_API_KEY',
    environment=os.getenv('PINECONE_ENV') or 'PINECONE_ENV',
)

In [7]:
import time

index_name = 'llama-2-rag'

# Create the index only if it is not listed yet; its dimension is taken from
# the length of the first embedding computed earlier.
index_exists = index_name in pinecone.list_indexes()
if not index_exists:
    pinecone.create_index(index_name, dimension=len(embeddings[0]), metric='cosine')

    # Poll once per second until the index reports itself ready.
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

    print("Index Ready")

In [8]:
# Open a handle to the index named by `index_name`.
index = pinecone.Index(index_name)
# Last expression of the cell, so the returned stats are shown as the cell output.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.04838,
 'namespaces': {'': {'vector_count': 4838}},
 'total_vector_count': 4838}

In [20]:

from datasets import load_dataset

# Load the train split of the pre-chunked arXiv papers dataset.
dataset_name = 'jamescalam/llama-2-arxiv-papers-chunked'
data = load_dataset(dataset_name, split='train')
data

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 9676
})

In [10]:
# Embed every chunk in `data` and upsert it into the Pinecone index in batches.
#
# `data` is expected to be the object returned by load_dataset. It is converted
# to a DataFrame once; the guard keeps this cell re-runnable, because a
# DataFrame has no `to_pandas` and the unguarded call fails on a second run.
if hasattr(data, 'to_pandas'):
    data = data.to_pandas()

batch_size = 32

for i in tqdm(range(0, len(data), batch_size), desc="Upsert", unit='batch'):
    i_end = min(len(data), i+batch_size)
    batch = data.iloc[i:i_end]

    # Read each needed column once instead of walking the rows with
    # iterrows() three separate times.
    texts = batch['chunk'].tolist()
    sources = batch['source'].tolist()
    titles = batch['title'].tolist()

    # One ID per chunk, formatted as "<doi>-<chunk-id>". The comprehension
    # variables no longer reuse the loop index name `i`.
    ids = [
        f"{doi}-{chunk_id}"
        for doi, chunk_id in zip(batch['doi'], batch['chunk-id'])
    ]
    embeds = embed_model.embed_documents(texts)
    metadata = [
        {'text': text, 'source': source, 'title': title}
        for text, source, title in zip(texts, sources, titles)
    ]
    # add to Pinecone; materialise the tuples so the client receives a list
    # rather than a one-shot iterator
    index.upsert(vectors=list(zip(ids, embeds, metadata)), show_progress=True)

print('Upsert Done!')

Upsert:   0%|          | 0/303 [00:00<?, ?batch/s]

Upsert: 100%|██████████| 303/303 [03:58<00:00,  1.27batch/s]

Upsert Done!





In [21]:
# Re-read the index statistics; as the last expression, the result is shown as the cell output.
index.describe_index_stats()


{'dimension': 384,
 'index_fullness': 0.04838,
 'namespaces': {'': {'vector_count': 4838}},
 'total_vector_count': 4838}

## Initializing the Hugging Face Pipeline

In [9]:
from torch import bfloat16
import transformers

model_id = 'meta-llama/Llama-2-13b-chat-hf'

# Quantisation settings: 4-bit NF4 with double quantisation, using bfloat16
# as the compute dtype.
quant_settings = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)
bnb_config = transformers.BitsAndBytesConfig(**quant_settings)

# Access token read from the environment; the literal is a placeholder that is
# used when the variable is unset.
hf_auth = os.environ.get('HUGGINGFACE_ENV') or 'HUGGINGFACE_ENV'

model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)

# Load the quantised model and let `device_map='auto'` decide weight placement.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
    use_auth_token=hf_auth,
    resume_download=True,
)
model.eval()
print(f"Model loaded on {device}")

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Tokenizer for the same checkpoint as the model, fetched with the same token.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)

In [None]:

# Build the text-generation pipeline around the loaded model and tokenizer.
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass generation parameters here too
    # Greedy decoding for deterministic output. This replaces temperature=0.0:
    # a zero temperature does not turn sampling off, and when sampling is
    # enabled (e.g. by the checkpoint's generation config) transformers rejects
    # a non-positive temperature.
    do_sample=False,
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [None]:
# Smoke-test the pipeline on one prompt and print the text of the first result.
prompt = "Explain to me the difference between nuclear fission and fusion."
res = generate_text(prompt)
first_generation = res[0]
print(first_generation["generated_text"])

## Running RetrievalQA Chain