# Llama -2 RAG on IX History text

## Package Import

In [1]:
import os
import torch
import pinecone
from tqdm import tqdm

from langchain.vectorstores import Pinecone
from langchain.llms import HuggingFacePipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader




  from tqdm.autonotebook import tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [8]:
!nvidia-smi

Sat Sep 30 20:43:07 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090         On | 00000000:17:00.0 Off |                  N/A |
| 30%   39C    P8               20W / 350W|  19060MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Prepare Llama-2 model

In [5]:
from torch import bfloat16
import transformers

model_id = 'meta-llama/Llama-2-13b-chat-hf'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

hf_auth = os.environ.get('HUGGINGFACE_ENV') or 'HUGGINGFACE_ENV'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth,
    resume_download = True

)
model.eval()
print(f"Model loaded on {device}")

ValueError: 
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
                        `device_map` to `from_pretrained`. Check
                        https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                        for more details.
                        

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



## Prepare Embedding Object

In [None]:
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

embed_model = HuggingFaceEmbeddings(
    model_name = embed_model_id,
    model_kwargs = {'device': device},
    encode_kwargs = {'device':device, 'batch_size':32}
)

In [None]:
# test embedding
docs = [
    "Kill him a and all that it takes",
    "why is he like that?"
]

embeddings = embed_model.embed_documents(docs)

print(f"We have {len(embeddings)} doc embeddings, each with "
      f"a dimensionality of {len(embeddings[0])}.")

## Pinecone (Vector DB)

In [None]:


# get API key from app.pinecone.io and environment from console
pinecone.init(
    api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY',
    environment = os.environ.get('PINECONE_ENV') or 'PINECONE_ENV',
)

#### Initiate Pinecone Index

In [None]:
import time

index_name = 'ix-history-2'

if index_name not in pinecone.list_indexes():
  pinecone.create_index(
      index_name,
      dimension=len(embeddings[0]),
      metric='cosine'
  )

  while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)

  print("Index Ready")

In [None]:
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

#### Loading Raw data

In [None]:
pdfLocation = "../datasets/Social/iess302.pdf"

loader = PyPDFLoader(pdfLocation)
data = loader.load()

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=30)
texts = text_splitter.split_documents(data)

print (f'Now you have {len(texts)} documents')

Now you have 66 documents


### Move embedding to Pinecone Index

In [None]:

ids = [f"{i}-{texts[i].metadata['page']}" for i in range(len(texts))]
textList = [text.page_content for text in texts]
embeds = embed_model.embed_documents(textList) 
metadata = [{
    'text': text.page_content,
         'source':text.metadata['source'],
         'page': text.metadata['page']
} for text in texts]

index.upsert(vectors=zip(ids, embeds, metadata), show_progress=True)

print('Upsert Done!')

Upsert Done!


## Preparing LLM Pipeline

In [None]:

generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    temperature=0.0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
    framework="pt",
)

In [None]:
llm = HuggingFacePipeline(pipeline=generate_text)

#### hf RAG Pipeline

In [None]:
text_field = 'text'  # field in metadata that contains text content

vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

In [None]:
query = "What were the social, economic and political conditions in Russia before 1905?"

vectorstore.similarity_search(query, k=6)
print(vectorstore)

<langchain.vectorstores.pinecone.Pinecone object at 0x7fe7f46e4280>


In [None]:
from langchain.chains import RetrievalQA

rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm, chain_type='stuff',
    retriever=vectorstore.as_retriever()
)

In [None]:
content = rag_pipeline('What were the social, economic and political conditions in Russia before 1905?')
print(content['result'])

 The Russian Revolution was a pair of revolutions that took place in Russia in 1917, which led to the overthrow of the monarchy and the establishment of the world's first socialist state. The first revolution, which occurred in February, forced Tsar Nicholas II to abdicate and established a provisional government. The second revolution, which occurred in October, overthrew the provisional government and established a communist government led by Vladimir Lenin and the Bolshevik Party.
