# Setup

In [40]:
import os
import json
import sys; sys.path.append("..")
import warnings; warnings.filterwarnings("ignore")
from dotenv import load_dotenv; load_dotenv()
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from rag.config import ROOT_DIR
from rag.config import EMBEDDING_DIMENSIONS

# Query Retrieval

With the embedded chunks now stored in the vector database, we can retrieve the chunks that are most relevant to an input query. We'll start by embedding the input query with the same embedding model that we used to embed our documents, so that the query and the chunks can be compared directly.

In [3]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import numpy as np

In [5]:
# The query has to be embedded with the same model that embedded the documents.
embedding_model_name = "thenlper/gte-base"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Embed the query and print the length of the resulting vector.
query = "What is the default size for map_batches?"
query_vector = embedding_model.embed_query(query)
embedding_query = np.array(query_vector)
print(len(embedding_query))

768


Now we can retrieve the top-k most relevant chunks by finding the embedded chunks that are closest to our embedded query. We rank the chunks by cosine distance (pgvector's `<=>` operator), which measures how closely two vectors point in the same direction: the smaller the distance, the more similar the chunk is to the query. There are many other ways to select the top chunks. Once we retrieve the top `num_chunks` chunks, we can collect the text from each chunk and use it as context to generate a response.

In [16]:
import psycopg
from pgvector.psycopg import register_vector
# Get context text: fetch the `num_chunks` rows with the smallest
# `embedding <=> query` value (ascending ORDER BY puts the closest first).
num_chunks = 5
retrieval_sql = "SELECT *, (embedding <=> %s) AS similarity_score FROM document ORDER BY similarity_score LIMIT %s"
with psycopg.connect(os.environ["DB_CONNECTION_STRING"]) as conn:
    register_vector(conn)  # pgvector's psycopg integration, registered on this connection
    with conn.cursor() as cur:
        cur.execute(retrieval_sql, (embedding_query, num_chunks))
        rows = cur.fetchall()

# Split the rows into parallel lists; columns are addressed by position.
# NOTE(review): index 4 is taken to be the appended similarity_score, which
# presumes `document` has exactly four columns — confirm against the schema.
ids, context, sources, scores = [], [], [], []
for row in rows:
    ids.append(row[0])
    context.append({"text": row[1]})
    sources.append(row[2])
    scores.append(row[4])


In [17]:
# Show every retrieved chunk: id, score, source, then the chunk text.
for chunk_id, score, source, item in zip(ids, scores, sources, context):
    print(chunk_id)
    print(score)
    print(source)
    print(item["text"])
    print()

19585
0.06648371622447513
https://docs.ray.io/en/master/data/api/doc/ray.data.Dataset.map_batches.html#ray-data-dataset-map-batches
The actual size of the batch provided to fn may be smaller than
batch_size if batch_size doesn’t evenly divide the block(s) sent
to a given map task. Default batch_size is 1024 with “default”.
compute – This argument is deprecated. Use concurrency argument.

25430
0.07046984786153998
https://docs.ray.io/en/master/data/transforming-data.html#configuring-batch-size
batch_size.
Note
The default batch size depends on your resource type. If you’re using CPUs,
the default batch size is 4096. If you’re using GPUs, you must specify an explicit
batch size.

25631
0.07540523106988828
https://docs.ray.io/en/master/data/batch_inference.html#configuring-batch-size
# Specify that each input batch should be of size 2.
ds.map_batches(assert_batch, batch_size=2)
Caution
The default batch_size of 4096 may be too large for datasets with large rows
(for example, tables with m

Let's wrap this code into a function for convenience.

In [18]:
def semantic_search(query, embedding_model, num_chunks=5):
    """Return the stored chunks closest to ``query`` in embedding space.

    Args:
        query: Text to search for.
        embedding_model: Object exposing ``embed_query(text)``; its result is
            wrapped in a numpy array and sent to the database as the query vector.
        num_chunks: Maximum number of rows to return (used as the SQL ``LIMIT``).

    Returns:
        A list of dicts with the keys ``id``, ``text``, ``source`` and
        ``score``, in the order returned by the database (ascending
        ``embedding <=> query`` value, i.e. closest chunk first).
    """
    query_vector = np.array(embedding_model.embed_query(query))
    sql = "SELECT *, (embedding <=> %s) AS similarity_score FROM document ORDER BY similarity_score LIMIT %s"

    # The connection string is read from the environment on every call.
    with psycopg.connect(os.environ["DB_CONNECTION_STRING"]) as connection:
        register_vector(connection)
        with connection.cursor() as cursor:
            cursor.execute(sql, (query_vector, num_chunks))
            semantic_context = []
            # Columns are addressed by position: 0=id, 1=text, 2=source, 4=score.
            for record in cursor.fetchall():
                semantic_context.append({
                    "id": record[0],
                    "text": record[1],
                    "source": record[2],
                    "score": record[4],
                })

    return semantic_context

# Answer Generation

We can now use the context to generate a response from our LLM. Without this relevant context that we retrieved, the LLM may not have been able to accurately answer our question. And as our data grows, we can just as easily embed and index any new data and be able to retrieve it to answer questions.

In [19]:
import openai
import time

In [34]:
from rag.utils import get_client
from rag.generate import prepare_response

In [35]:
def generate_response(
        llm, temperature=0.0, stream=True,
        system_content="", assistant_content="", user_content="",
        max_retries=1, retry_interval=60):
    """Generate a response from an LLM, retrying when the request fails.

    Args:
        llm: Model name; passed to ``get_client`` and sent as ``model``.
        temperature: Sampling temperature forwarded to the chat completion call.
        stream: Forwarded to the chat completion call and to ``prepare_response``.
        system_content: Content of the system message (skipped when empty).
        assistant_content: Content of the assistant message (skipped when empty).
        user_content: Content of the user message (skipped when empty).
        max_retries: Number of retries after the first attempt, so at most
            ``max_retries + 1`` requests are made.
        retry_interval: Seconds to wait between a failed attempt and the next one.

    Returns:
        The value of ``prepare_response(chat_completion, stream=stream)`` for
        the first successful attempt, or ``""`` if every attempt raised.
    """
    retry_count = 0
    client = get_client(llm)
    # Roles with empty content are dropped so no blank messages are sent.
    messages = [
        {"role": role, "content": content}
        for role, content in (
            ("system", system_content),
            ("assistant", assistant_content),
            ("user", user_content),
        )
        if content
    ]
    while retry_count <= max_retries:
        try:
            chat_completion = client.chat.completions.create(
                model=llm,
                messages=messages,
                stream=stream,
                temperature=temperature,
            )
            return prepare_response(chat_completion, stream=stream)
        except Exception as e:
            # Best effort: report the failure and fall through to a retry.
            print(f"Exception occured: {e}")
            # Wait only when another attempt follows; sleeping after the last
            # failure would just delay the empty fallback result.
            if retry_count < max_retries:
                time.sleep(retry_interval)
            retry_count += 1

    return ""
    

**Note**: We’re using a temperature of 0.0 to enable reproducible experiments but you should adjust this based on your use case. For use cases that need to always be factually grounded, we recommend very low temperature values while more creative tasks can benefit from higher temperatures.

In [21]:
# Retrieve the chunks for the current query and keep only their text as LLM context.
context_results = semantic_search(
    query=query,
    embedding_model=embedding_model,
    num_chunks=5,
)
context = [chunk["text"] for chunk in context_results]
print(context)

['The actual size of the batch provided to fn may be smaller than\nbatch_size if batch_size doesn’t evenly divide the block(s) sent\nto a given map task. Default batch_size is 1024 with “default”.\ncompute – This argument is deprecated. Use concurrency argument.', 'batch_size.\nNote\nThe default batch size depends on your resource type. If you’re using CPUs,\nthe default batch size is 4096. If you’re using GPUs, you must specify an explicit\nbatch size.', '# Specify that each input batch should be of size 2.\nds.map_batches(assert_batch, batch_size=2)\nCaution\nThe default batch_size of 4096 may be too large for datasets with large rows\n(for example, tables with many columns or a collection of large images).', 'Configuring Batch Size#\nConfigure the size of the input batch that’s passed to __call__ by setting the batch_size argument for ds.map_batches()', 'batch_size=64,\n        shuffle=True)']


In [33]:
# Generate response
# NOTE: `query` is intentionally not reassigned here. `context` was retrieved
# for the existing `query`, so the prompt must ask that same question. To ask
# something else, change `query` and re-run the retrieval first.
response = generate_response(
    # llm="meta-llama/Llama-2-70b-chat-hf",
    llm="gpt-3.5-turbo",
    temperature=0.0,
    stream=True,
    system_content="Answer the query using the context provided. Be succinct.",
    user_content=f"query: {query}, context: {context}")
# Stream response (prints nothing if generate_response fell back to "")
for content in response:
    print(content, end='', flush=True)

<openai.OpenAI object at 0x29a6cbad0>
Exception occured: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Exception occured: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}


# Conversation Agent

Let's combine context retrieval and response generation into a convenient query agent that we can easily use to generate our responses. This conversational query agent sets up the embedding model and the LLM, retrieves the context for each query, and passes that context to the LLM for response generation.

In [39]:
from rag.embed import get_embedding_model
from rag.utils import get_num_tokens, trim
from rag.config import MAX_CONTEXT_LENGTHS

In [None]:
from typing import Any


class QueryAgent:
    """Query agent that retrieves context chunks and asks an LLM to answer.

    Construction loads the embedding model and computes the token budget for
    the user message; calling the agent runs retrieval followed by generation.
    """

    def __init__(self, embedding_model_name="thenlper/gte-base",
                 llm="gpt-3.5-turbo", temperature=0.0,
                 max_context_length=4096, system_content="", assistant_content="",
                 max_content_length=None) -> None:
        """Set up the embedding model and the generation settings.

        Args:
            embedding_model_name: Name passed to ``get_embedding_model``.
            llm: Model name forwarded to ``generate_response``.
            temperature: Sampling temperature forwarded to ``generate_response``.
            max_context_length: Total context length of the LLM; half of it
                is reserved for the input.
            system_content: System message sent with every query.
            assistant_content: Assistant message sent with every query.
            max_content_length: Previous (misspelled) name of
                ``max_context_length``; when given it takes precedence, so
                callers using the old keyword keep working.
        """
        if max_content_length is not None:
            max_context_length = max_content_length

        self.embedding_model = get_embedding_model(
            embedding_model_name=embedding_model_name,
            model_kwargs={"device":"cpu"},
            encode_kwargs={"device": "cpu", "batch_size": 100}
        )

        # Context length (restrict input length to 50% of total context length)
        input_budget = int(0.5 * max_context_length)

        # LLM
        self.llm = llm
        self.temperature = temperature
        # What is left for the user message after the fixed messages are counted.
        self.context_length = input_budget - get_num_tokens(system_content + assistant_content)
        self.system_content = system_content
        self.assistant_content = assistant_content

    def __call__(self, query, num_chunks=5, stream=True) -> Any:
        """Answer ``query`` using retrieved context.

        Args:
            query: The question to answer.
            num_chunks: Number of chunks to retrieve via ``semantic_search``.
            stream: Forwarded to ``generate_response``.

        Returns:
            A dict with the keys ``question``, ``sources``, ``answer`` and ``llm``.
        """
        # Get sources and context
        context_results = semantic_search(
            query=query,
            embedding_model=self.embedding_model,
            num_chunks=num_chunks
        )

        # Generate response
        context = [item["text"] for item in context_results]
        sources = [item["source"] for item in context_results]
        user_content = f"query: {query}, context: {context}"

        answer = generate_response(
            llm=self.llm,
            temperature=self.temperature,
            stream=stream,
            system_content=self.system_content,
            # Already budgeted for in context_length, so it must be sent too.
            assistant_content=self.assistant_content,
            user_content=trim(user_content, self.context_length)
        )

        # Result
        result = {
            "question": query,
            "sources": sources,
            "answer": answer,
            "llm": self.llm,
        }
        return result


With this, we can use our RAG application in just a few lines:

In [38]:
# Models used to build the agent in the next cell: the embedding model for
# retrieval and the LLM for answer generation.
embedding_model_name = "thenlper/gte-base"
llm = "gpt-3.5-turbo"

In [None]:
# Build the agent, then run retrieval and generation for one question.
system_content = "Answer the query using the context provided. Be succinct."
agent = QueryAgent(
    embedding_model_name=embedding_model_name,
    llm=llm,
    max_context_length=MAX_CONTEXT_LENGTHS[llm],
    system_content=system_content,
)

query = "What is the default batch size for map_batches?"
# NOTE(review): presumably non-streaming so that the answer can be serialized
# by json.dumps — verify against prepare_response.
result = agent(query=query, stream=False)
formatted_result = json.dumps(result, indent=2)
print(formatted_result)