# Llama -2 RAG on IX History text

## Package Import

In [1]:
import os
import torch
import pinecone
from tqdm import tqdm

from langchain.vectorstores import Pinecone
from langchain.llms import HuggingFacePipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader, PyPDFDirectoryLoader, PyPDFLoader




  from tqdm.autonotebook import tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [3]:
!nvidia-smi

Mon Oct  2 22:53:57 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090         On | 00000000:17:00.0 Off |                  N/A |
|  0%   40C    P8               28W / 350W|   8344MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Prepare Llama-2 model

In [4]:
from torch import bfloat16
import transformers

model_id = 'meta-llama/Llama-2-13b-chat-hf'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

hf_auth = os.environ.get('HUGGINGFACE_ENV') or 'HUGGINGFACE_ENV'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth,
    resume_download = True

)
model.eval()
print(f"Model loaded on {device}")

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.83s/it]


Model loaded on cuda


In [6]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

## Prepare Embedding Object

In [7]:
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

embed_model = HuggingFaceEmbeddings(
    model_name = embed_model_id,
    model_kwargs = {'device': device},
    # encode_kwargs = {'device':device, 'batch_size':32}
)

In [8]:
# test embedding
docs = [
    "Kill him a and all that it takes",
    "why is he like that?"
]

embeddings = embed_model.embed_documents(docs)

print(f"We have {len(embeddings)} doc embeddings, each with "
      f"a dimensionality of {len(embeddings[0])}.")

We have 2 doc embeddings, each with a dimensionality of 384.


## Pinecone (Vector DB)

In [9]:


# get API key from app.pinecone.io and environment from console
pinecone.init(
    api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY',
    environment = os.environ.get('PINECONE_ENV') or 'PINECONE_ENV',
)

#### Initiate Pinecone Index

In [10]:
import time

index_name = 'anshuman-info'

if index_name not in pinecone.list_indexes():
  pinecone.create_index(
      index_name,
      dimension=len(embeddings[0]),
      metric='cosine'
  )

  while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)

  print("Index Ready")

In [11]:
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.00019,
 'namespaces': {'': {'vector_count': 19}},
 'total_vector_count': 19}

#### Loading Raw data

In [11]:
pdfLocation = "../datasets/Social/iess302.pdf"
pdf_dir = "../datasets/Social/"
loader = PyPDFDirectoryLoader(pdf_dir)
# loader = PyPDFLoader(pdfLocation)
data = loader.load()
print(data)

[]


In [12]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=40, add_start_index=True )
texts = text_splitter.split_documents(data)

print (f'Now you have {len(texts)} documents')

Now you have 66 documents


### Move embedding to Pinecone Index

In [13]:

ids = [f"{i}-{texts[i].metadata['page']}" for i in range(len(texts))]
textList = [text.page_content for text in texts]
embeds = embed_model.embed_documents(textList) 
metadata = [{
    'text': text.page_content,
         'source':text.metadata['source'],
         'page': text.metadata['page']
} for text in texts]

index.upsert(vectors=zip(ids, embeds, metadata), show_progress=True)

print('Upsert Done!')

Upsert Done!


## Preparing LLM Pipeline

In [20]:

generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    temperature=0.0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
    framework="pt",
    # verbose=False
)

In [21]:
llm = HuggingFacePipeline(pipeline=generate_text)

#### hf RAG Pipeline

In [14]:
text_field = 'text'  # field in metadata that contains text content

vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

In [15]:
query = "Why is anshuman amazing?"

vectorstore.similarity_search(query, k=6)
print(vectorstore)

<langchain.vectorstores.pinecone.Pinecone object at 0x7f036ac3ee20>


In [22]:
from langchain.chains import RetrievalQA
from langchain.chains import ConversationalRetrievalChain

rag_pipeline = ConversationalRetrievalChain.from_llm(
    llm=llm, chain_type='stuff',
    retriever=vectorstore.as_retriever(),
)

In [23]:
history = []

In [24]:
prompt = 'How is Anshuman an amazing person, compared to others?'

content = rag_pipeline({"question": prompt, "chat_history": history})
print(content["answer"])

 Based on the information provided, it appears that Anshuman has a strong background in technology and programming, with experience in multiple languages and frameworks. He has also contributed to several open-source projects and has been recognized as a mentor and member of the AmFOSS club. Additionally, he has worked on various projects such as the Antibiotic Stewardship project and the CIR Internship Portal, demonstrating his versatility and ability to work on different types of projects. Overall, it seems that Anshuman is an accomplished and well-rounded individual with a strong skill set and a commitment to open-source software.


In [25]:
history = [(prompt, content["answer"])]

In [28]:
prompt2 = 'But you did not explain how is he compared to others at age of just 19! ?'

content2 = rag_pipeline({"question": prompt2, "chat_history": history})
print(content2["answer"])

  Based on the information provided, it appears that Anshuman has accomplished a great deal for someone of his age. He has experience working with various programming languages and frameworks, has contributed to open-source projects, and has even mentored younger students in these technologies. Additionally, he has completed a bachelor's degree in computer science and artificial intelligence from Amrita Vishwa Vidyapeetham, which suggests that he has a strong foundation in computer science and related fields. However, without more information about the typical accomplishments and skills of individuals at the age of 19, it is difficult to directly compare Anshuman's achievements to those of others in this age group.


: 

## Adding prompt template