In [8]:
# Environment sanity check: print the installed TensorFlow and Keras versions.
# Optional (disabled): also report the transformers version.
# import transformers
# print(transformers.__version__)

import tensorflow as tf
print(tf.__version__)  # Expected output: 2.14.0

import keras
print(keras.__version__)


2.14.0
2.14.0


In [9]:
import os

import base64
import gc
import random
import tempfile
import time
import uuid 

from IPython.display import Markdown, display


from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core import PromptTemplate
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, ServiceContext, SimpleDirectoryReader

import streamlit as st

In [10]:
# nest_asyncio lets asyncio code run inside the notebook's already-running
# event loop, so the asynchronous calls made by our RAG pipeline work smoothly.
import nest_asyncio

nest_asyncio.apply()

### Set up Qdrant vector database

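- Here, "chat_with_docs" is the name of the Qdrant collection that will store the document embeddings used for query-based information retrieval in our demo.
- qdrant_client.QdrantClient initializes a QdrantClient instance and connects it to a Qdrant server running locally (host `localhost`, port `6333`). The server must already be running; otherwise, later calls that talk to Qdrant fail with a connection error.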

In [11]:
import qdrant_client
# Name of the Qdrant collection that will hold the document embeddings.
collection_name = "chat_with_docs"

# Client for the Qdrant server expected at localhost:6333.
client = qdrant_client.QdrantClient(host = "localhost", port = 6333)


### Read the documents

- Next, we'll set up a document loader that reads files from a specified directory and extracts their contents for use in our RAG pipeline.

- This will allow us to retrieve text from PDF files, which we'll later transform into embeddings and store in the Qdrant vector database created above.

In [12]:
from llama_index.core import SimpleDirectoryReader

# Root folder that is scanned, sub-folders included, for files to ingest.
input_dir_path = './docs'

# Restrict the reader to ".pdf" files found anywhere under the root folder.
loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    required_exts=[".pdf"],
    recursive=True,
)

# Read the matching files into a list of document objects.
docs = loader.load_data()

In [13]:
# Sanity check: container type and number of loaded documents.
type(docs), len(docs)

(list, 39)

In [14]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, ServiceContext, StorageContext

def create_index(documents):
    """Embed ``documents`` and store the resulting vectors in Qdrant.

    The vector store is built from the module-level ``client`` and
    ``collection_name``.

    Args:
        documents: The documents to index (e.g. the list returned by the
            directory loader).

    Returns:
        The ``VectorStoreIndex`` created from ``documents`` and backed by
        the Qdrant vector store.
    """
    # Back the index with the Qdrant collection instead of the default store.
    qdrant_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
    )
    qdrant_storage = StorageContext.from_defaults(vector_store=qdrant_store)

    # Embeds each document and writes it to the Qdrant vector store.
    return VectorStoreIndex.from_documents(
        documents,
        storage_context=qdrant_storage,
    )

### Load the embedding model and index data

- We set up an embedding model from Hugging Face to convert our documents into vector embeddings, which we'll then store in Qdrant using the `create_index` function defined above.

In [15]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Hugging Face embedding model used to turn documents into vector embeddings.
# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run locally — confirm it is actually required for this model.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5",
                                   trust_remote_code=True)

# Register the model as the global default so the same embedding model is used
# throughout the RAG pipeline, keeping embedding generation consistent.
Settings.embed_model = embed_model

# Call create_index (defined earlier) with docs, the list of loaded documents:
# it embeds each document and stores the embeddings in Qdrant.
# NOTE(review): this step talks to the Qdrant server; it presumably fails with
# a connection error when no server is listening on localhost:6333 — verify.
index = create_index(docs)

ResponseHandlingException: [WinError 10061] No connection could be made because the target machine actively refused it

### Load the LLM

- We are also specifying a request_timeout of 120 seconds for requests to the LLM to ensure that the system doesn’t get stuck if the model takes too long to respond.

- Finally, like before, we set the above LLM instance as the default language model in Settings, making it the primary model used in our RAG pipeline.
- We run the Llama 3.2 1B model because it is smaller and does not need much memory.

In [None]:
from llama_index.llms.ollama import Ollama

# Llama 3.2 1B served through Ollama; each request may take up to 120 seconds
# before timing out.
llm = Ollama(model="llama3.2:1b", request_timeout=120.0)

# Make this LLM the default language model for the rest of the pipeline.
Settings.llm = llm

### Define the prompt template

- In this step, we create a prompt template that defines a consistent format to guide the LLM about the context it should look at while answering the query.

In [None]:
from llama_index.core import PromptTemplate

# Question-answering prompt. It contains two placeholders: {context_str} for
# the context passage and {query_str} for the user's query. The text inside
# the triple quotes (indentation included) is part of the prompt itself.
template = """Context information is below:
              ---------------------
              {context_str}
              ---------------------
              Given the context information above I want you to think
              step by step to answer the query in a crisp manner,
              incase you don't know the answer say 'I don't know!'
            
              Query: {query_str}
        
              Answer:"""

# Wrap the raw string in a PromptTemplate so it can be handed to the query engine.
qa_prompt_tmpl = PromptTemplate(template)

### Reranking

This process rearranges the chunks so that the most relevant ones are prioritized for the response generation.

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Cross-encoder reranker: re-scores the retrieved chunks and keeps only the
# top 3 most relevant ones (top_n=3) based on the model's scoring.
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", 
    top_n=3
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


### Query the document

- The query engine integrates the retrieval, re-ranking, and prompt-based response generation steps.

In [None]:
# Build the query engine: retrieve the 10 most similar chunks from the index,
# then hand them to the reranker, which narrows them down before answering.
query_engine = index.as_query_engine(similarity_top_k=10,
                                     node_postprocessors=[rerank])

# Use our custom question-answering prompt for response synthesis.
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

# Run the full pipeline (retrieve -> rerank -> generate) for one question;
# `response` is rendered by the following cell.
response = query_engine.query("What exactly is DSPy?")

In [None]:
from IPython.display import Markdown, display

# Render the query engine's answer as Markdown in the notebook output.
display(Markdown(str(response)))

DSPy stands for "Demonstrate-Search-Predict", which is a programming model developed for natural language processing (NLP) tasks. It provides a way to abstract and automate prompting techniques using natural language signatures, type signatures, parameterized declarative modules, and tele-prompters.

### Limitations:
- The first problem is that a question is usually not semantically similar to its answers. The search may retrieve documents that contain the same words as the question, or that are used in the same context, without providing the information needed to answer it. Because the search retrieves the documents most similar to the question, depending on the data, too many irrelevant documents may show a higher cosine similarity than the documents that actually contain the answer. One way to mitigate this is to use the LLM to generate a hypothetical answer, embed that answer, and use this embedding to query the vector database.

- Semantic similarity can be diluted: if the data has been broken down into big chunks of text, each chunk is likely to contain several different and unrelated pieces of information. If you perform a similarity search on that data, the pertinent information may be diluted, and the search may return irrelevant documents instead. It is important to break down the data so that each chunk contains no more than a few paragraphs, to ensure more "uniqueness" in the concepts developed in each text.

- With the RAG approach, it is very important to limit the type of questions we ask the LLM. If we ask questions that require aggregating data from all over the database, the answers are most likely going to be wrong, and the LLM won't be able to know that. A similarity search works when the answer is contained in a few retrieved chunks; however, if the information requires scanning all the documents to find the answer, a similarity search won't find it.