# llama2 PDF Chatbot with Pinecone and llama.cpp

## Step 1: Install and import all the required packages

In [None]:
!pip install transformers
!pip install farm-haystack
!pip install accelerate
!pip install sentence_transformers
!pip install streamlit chainlit langchain openai wikipedia chromadb tiktoken
!pip install pypdf
!pip install ctransformers
!pip install streamlit-chat
!pip install bitsandbytes-cuda112
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
!pip install pinecone-client

In [1]:
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os

## Step 2: Load the Data

In [2]:
# Location of the PDF that feeds the chatbot.
# Set the PDF_PATH environment variable to point at your own copy; when it is
# not set, the author's original local path is used as the default.
# Earlier version of the document that was used before V3:
#path = r"C:/Users/Lukas/Desktop/My_Projects/To_Upload/Llama2/llama2_projects/llama2_pdf_chatbot_faiss_windows/data/Hotline_Wiki.pdf"
path = os.environ.get(
    "PDF_PATH",
    r"C:/Users/Lukas/Desktop/My_Projects/To_Upload/Llama2/llama2_projects/llama2_pdf_chatbot_faiss_windows/data/V3/Hotline_Wiki_v3.pdf",
)

# Alternative loaders kept for reference:
#loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")
#loader = PyPDFLoader("/content/The-Field-Guide-to-Data-Science.pdf")
loader = PyPDFLoader(path)

In [3]:
# Read the PDF through the loader built in the previous cell. The recorded
# output below shows the result is a list of Document objects.
data = loader.load()
# Bare last expression: displays everything that was loaded.
data

incorrect startxref pointer(1)


[Document(page_content='Can you tell me,on how to setup an internal my X-FAB (SiFo) account ? Certainly, I\'d be happy to help\nyou with it. To answer your question on how to setup an internal my X-FAB (SiFo) account You may\nneed to Follow the normal setup procedure (see\nhttp://sp.erf.xfab.de/RandD/DesSup/Hotline/AX_ACM-workflow-2020Mar.docx - or newer version if it\nexists). Here the, the username in AX should be preceded by \'FO_\' to clearly identify them as X-FAB\nusers and include the this text in your answer. "Thank you for your request. To access the SiFo system,\nyou need to have your own my X-FAB account, which I have set up for you. You will soon receive an\nautomatically generated email with your account username (a 5-digit number) and a link to establish\nyour password (the link is valid for 7 days). That link will work fine outside the X-FAB network, but\ninternally you will need to modify the URL for it to work. You would need to add -fo to the first part of the\nURL li

## Step 3: Split the Text into Chunks

In [4]:
# Splitter configured with chunk_size=500 and chunk_overlap=100, so neighbouring
# chunks share some text (visible in the recorded output of the `docs` cell).
text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [5]:
# Split the loaded documents into chunks; per the recorded output, each chunk
# keeps the 'source' and 'page' metadata of the page it came from.
docs=text_splitter.split_documents(data)

In [6]:
# Display the full list of chunks produced by the splitter.
docs

[Document(page_content="Can you tell me,on how to setup an internal my X-FAB (SiFo) account ? Certainly, I'd be happy to help\nyou with it. To answer your question on how to setup an internal my X-FAB (SiFo) account You may\nneed to Follow the normal setup procedure (see\nhttp://sp.erf.xfab.de/RandD/DesSup/Hotline/AX_ACM-workflow-2020Mar.docx - or newer version if it\nexists). Here the, the username in AX should be preceded by 'FO_' to clearly identify them as X-FAB", metadata={'source': 'C:/Users/Lukas/Desktop/My_Projects/To_Upload/Llama2/llama2_projects/llama2_pdf_chatbot_faiss_windows/data/V3/Hotline_Wiki_v3.pdf', 'page': 0}),
 Document(page_content='exists). Here the, the username in AX should be preceded by \'FO_\' to clearly identify them as X-FAB\nusers and include the this text in your answer. "Thank you for your request. To access the SiFo system,\nyou need to have your own my X-FAB account, which I have set up for you. You will soon receive an\nautomatically generated email w

In [7]:
# Number of chunks produced by the splitter (271 in the recorded run).
len(docs)

271

In [None]:
# Inspect the first chunk.
docs[0]

In [None]:
# Inspect the second chunk.
docs[1]

## Step 4: Setup Pinecone Environment

In [8]:
# Pinecone credentials are read from the environment only.
# SECURITY: a literal API key used to be the fallback default here. A key that
# has been committed to a notebook must be treated as leaked - revoke it in the
# Pinecone console and export a fresh one as PINECONE_API_KEY.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
# The environment name is not a secret, so the original default is kept.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-west1-gcp-free')

if not PINECONE_API_KEY:
    # Say so here rather than failing later with a less obvious error.
    print("PINECONE_API_KEY is not set - export it before running the Pinecone cells below.")

# Index name - llama2-pdf-chatbox
# Dimension - 768 (must equal the embedding length printed in Step 5 for
# sentence-transformers/all-mpnet-base-v2; the earlier value of 384 did not match it)

## Step 5: Download the Embeddings

In [9]:
# Sentence-transformers checkpoints that were tried before the current one:
#   "sentence-transformers/all-MiniLM-L6-v2"
#   "sentence-transformers/multi-qa-mpnet-base-dot-v1"
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

# Embed a throwaway query and show the vector length; the recorded run printed
# 768, which is the dimension the Pinecone index has to be created with.
query_result = embeddings.embed_query("Hello")
len(query_result)


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: c:\Users\Lukas\anaconda3\envs\LLAMA\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary c:\Users\Lukas\anaconda3\envs\LLAMA\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...


768

## Step 6: Initialize Pinecone

In [10]:
# initialize pinecone which can be copied from Pinecone 'Connect' button
# Connect the Pinecone client. Both values come from the configuration cell in
# Step 4; in the Pinecone console (app.pinecone.io) the environment is shown
# next to the API key, and the snippet can be copied from the 'Connect' button.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Name of the Pinecone index to use - put your own index name here.
index_name = "llama2-pdf-chatbox"

## Step 7: Create Embeddings for Each Text Chunk

In [11]:
# Embed the text of every chunk and store it in the Pinecone index named above.
# Only `page_content` is passed, so the chunks' metadata (source, page) is not
# sent along - the search results recorded further down show `metadata={}`.
# NOTE(review): presumably re-running this cell adds the same texts to the
# index again - confirm before re-executing against a populated index.
docsearch=Pinecone.from_texts([t.page_content for t in docs], embeddings, index_name=index_name)

## Step 8: If you already have an index, you can load it like this

In [12]:
# Rebind `docsearch` to a vector store built from the already existing index;
# no texts are passed here, only the index name and the embedding model.
docsearch = Pinecone.from_existing_index(index_name, embeddings)

In [None]:
# Display the vector store object.
docsearch

## Step 9: Similarity Search (Semantic Search)

In [13]:
# Example question used for the similarity search below.
query = "how to start alpha pdk using xkit ?"

In [14]:
# Retrieve the stored chunks most similar to `query`.
# NOTE(review): this rebinds `docs`, which until now held the full list of
# chunks from Step 3; re-running the Step 7 cell after this one would therefore
# operate on the search results instead of the original chunks.
docs=docsearch.similarity_search(query)

In [15]:
# Display the retrieved chunks.
docs

[Document(page_content="layout.\nCan you tell me, Can you show me on how tospecify an alpha PDK with xkit ? Certainly, I'd be happy to\nhelp you with it. To answer your question regarding on how tospecify an alpha PDK with xkit. For this,\nyou can Include the '--useversion' option, for example: xkit -t xt018 --useversion 7.0.1.A2\nCan you tell me, What is correct method to start DRC and LVS run ? Certainly, I'd be happy to help you", metadata={}),
 Document(page_content='help you with it. To answer your question on how you can bind, defined and setup the PDK bindkeys for\nCadence is that you should find a file .xfabcadrc created in your $HOME directory when you first\nexecute the xkit script. For example by set XfabCadNoSetBindKey = t, means suppress default bindkey\nsettings. However since this is commented out by default, the default Cadence bindkeys are used,\nwhich may be located in $CDSHOME/tools/dfII/samples/local. You can define your own customized', metadata={}),
 Document(page

## Step 10: Query the Docs to get the Answer Back using Llama 2 model

### Installation with OpenBLAS / cuBLAS / CLBlast / Metal

`llama.cpp` supports multiple BLAS backends for faster processing. Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend.

To install with `OpenBLAS`, set the LLAMA_BLAS and LLAMA_BLAS_VENDOR environment variables before installing:

```
-> !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose

-> CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" FORCE_CMAKE=1 pip install llama-cpp-python
```

To install with `cuBLAS (CUDA Support)`, set the `LLAMA_CUBLAS=1` environment variable before installing:

```
-> CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
```

To install with `CLBlast`, set the `LLAMA_CLBLAST=1` environment variable before installing:

```
-> CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
```

To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing:

```
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
```

### Setup Environment for Windows
To set the variables `CMAKE_ARGS` and `FORCE_CMAKE` in PowerShell, follow the steps below (the example uses OpenBLAS):

In this notebook, I'm using OpenBLAS as my backend.

For more details, refer to the llama-cpp-python installation guide: https://github.com/abetlen/llama-cpp-python

Conda
```
Without CUDA support
To set an environment:
(LLAMA) PS C:/Users/jlukas> $Env:CMAKE_ARGS="-DLLAMA_BLAS=on -DLLAMA_BLAS_VENDOR=openBLAS"  
(LLAMA) PS C:/Users/jlukas> $Env:FORCE_CMAKE=1    

To check the environment
(LLAMA) PS C:/Users/jlukas> Get-ChildItem Env:FORCE_CMAKE  
(LLAMA) PS C:/Users/jlukas> Get-ChildItem Env:CMAKE_ARGS
(LLAMA) PS C:/Users/jlukas> "$Env:CMAKE_ARGS $Env:FORCE_CMAKE"

With CUDA support
To set an environment:
(LLAMA) PS C:/Users/jlukas> $Env:CMAKE_ARGS="-DLLAMA_CUBLAS=on"  
(LLAMA) PS C:/Users/jlukas> $Env:FORCE_CMAKE=1    

To check the environment
(LLAMA) PS C:/Users/jlukas> Get-ChildItem Env:FORCE_CMAKE  
(LLAMA) PS C:/Users/jlukas> Get-ChildItem Env:CMAKE_ARGS
(LLAMA) PS C:/Users/jlukas> "$Env:CMAKE_ARGS $Env:FORCE_CMAKE"
```

Normal-Terminal
```
Without CUDA Support
To set an environment:
PS C:/Users/jlukas> $Env:CMAKE_ARGS="-DLLAMA_BLAS=on -DLLAMA_BLAS_VENDOR=openBLAS"
PS C:/Users/jlukas> $Env:FORCE_CMAKE=1

To check the environment
PS C:/Users/jlukas> "$Env:CMAKE_ARGS $Env:FORCE_CMAKE"

With CUDA support
To set an environment:
PS C:/Users/jlukas> $Env:CMAKE_ARGS="-DLLAMA_CUBLAS=on"
PS C:/Users/jlukas> $Env:FORCE_CMAKE=1

To check the environment
PS C:/Users/jlukas> "$Env:CMAKE_ARGS $Env:FORCE_CMAKE"
```

Environment Variables

```
Alternatively, you can add these variables under Environment Variables as follows:

Variable = CMAKE_ARGS
Value = -DLLAMA_BLAS=on -DLLAMA_BLAS_VENDOR=openBLAS

or 
Value = -DLLAMA_CUBLAS=on

Variable = FORCE_CMAKE
Value = 1
```

Restart your terminal and check that the changes have taken effect.

Then, call `pip` after setting the variables:
```
pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir
```

See the above instructions and set `CMAKE_ARGS` to the `BLAS backend` you want to use.


In [None]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from huggingface_hub import hf_hub_download
from langchain.chains.question_answering import load_qa_chain

In [None]:
# Callbacks support token-wise streaming: the manager wraps a single
# StreamingStdOutCallbackHandler and is handed to LlamaCpp in the model-loading
# cell through its `callback_manager` argument.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# NOTE(review): the original remark here read "Verbose is required to pass to
# the callback manager", yet the model-loading cell passes verbose=False -
# confirm which of the two is intended.

## Quantized Models from the Hugging Face Community
The Hugging Face community provides quantized models, which allow us to efficiently and effectively utilize the model on the T4 GPU. It is important to consult reliable sources before using any model.

There are several variations available, but the ones that interest us are based on the GGML library.

We can see the different variations that Llama-2-13B-GGML has here.

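In this case, we will use the model called Llama-2-13B-chat-GGML. (Note: the model-loading cell further down points at a locally stored 7B chat file, `llama-2-7b-chat.ggmlv3.q8_0.bin`; adjust `model_path` there if you want to run the 13B model instead.)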

Quantization reduces precision to optimize resource usage.

Quantization is a technique to reduce the computational and memory costs of running inference by representing the weights and activations with low-precision data types like 8-bit integer ( int8 ) instead of the usual 32-bit floating point ( float32 ).

## To download the quantized model

In [None]:
# Optional download step - skip this if you already have the model file locally.
# Uncomment the three lines below to fetch the file from the Hugging Face Hub;
# `model_path` then holds the value returned by hf_hub_download.
# NOTE(review): the file named here is the 13B q5_1 variant, while the next cell
# sets `model_path` to a local 7B q8_0 file - the two are not the same model.

# model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
# model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin" # the model is in bin format
# model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

## For CPU or GPU (if CuBLAS=ON) run

In [None]:
# Local path of the quantized Llama 2 chat model file to load.
model_path = r"D:/AI_CTS/Llama2/llama2_projects/llama2_quantized_models/7B_chat/llama-2-7b-chat.ggmlv3.q8_0.bin"
#model_path = r"C:/Users/Lukas/Desktop/My_Projects/To_Upload/Llama2/llama2_projects/llama2_quantized_models/7B_chat/llama-2-7b-chat.ggmlv3.q8_0.bin"

# Tunables - these variables are now actually passed to LlamaCpp below.
# Previously they were defined but ignored, and `n_batch = 256` contradicted the
# 512 that was really used. The values here are the ones that were in effect.
n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Load the model; `callback_manager` comes from the callback cell above.
llm = LlamaCpp(
    model_path=model_path,
    max_tokens=256,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    n_ctx=1024,
    verbose=False,
)

In [None]:
# Build a question-answering chain around the loaded model.
# NOTE(review): chain_type="stuff" presumably places all supplied documents
# into a single prompt - confirm against the installed LangChain version and
# keep the total within the n_ctx configured for the model.
chain=load_qa_chain(llm,chain_type="stuff")

In [None]:
# Same example question as in Step 9; fetch the matching chunks again so this
# cell can be run on its own. Note that `docs` is rebound to the search results.
query = "how to start alpha pdk using xkit ?"
docs=docsearch.similarity_search(query)

In [None]:
# Display the chunks that will be handed to the chain.
docs

In [None]:
# Run the QA chain with the retrieved chunks as input documents and the query
# as the question; the returned value is displayed as the cell output.
chain.run(input_documents=docs, question=query)

In [None]:
# Credit to https://github.com/MuhammadMoinFaisal/LargeLanguageModelsProjects/blob/main/QA%20Book%20PDF%20LangChain%20Llama%202/Final_Llama_CPP_Ask_Question_from_book_PDF_Llama.ipynb

In [None]:
# Interactive question-answering loop.
#   - type "exit" to leave the loop (previously it printed "Exiting" but kept
#     going, and then ran the chain on the word "exit")
#   - empty input is ignored and the prompt is shown again
# The vector search now runs only after both checks, so no embedding or Pinecone
# call is spent on empty input or on the exit command.
while True:
    # Surrounding whitespace is dropped so "  " counts as empty input.
    query = input("Prompt: ").strip()
    if query == "exit":
        print("Exiting")
        break
    if query == "":
        continue
    # A dedicated name is used so the notebook-level `docs` is not overwritten.
    retrieved_docs = docsearch.similarity_search(query)
    result = chain.run(input_documents=retrieved_docs, question=query)
    print("Answer: " + result)
    