<a href="https://colab.research.google.com/github/anantsrivast/vs_rag_wkshp/blob/main/vector_search_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the workshop repository; the dependency install cell below reads its requirements.txt
!git clone https://github.com/anantsrivast/vs_rag_wkshp


In [None]:
# Run a `%%javascript` cell magic whose script calls `IPython.notebook.clear_all_output()`
# NOTE(review): `IPython.notebook` is presumably a classic Jupyter Notebook front-end object -- confirm this has any effect in Colab
from IPython import get_ipython
get_ipython().run_cell_magic('javascript', '', 'IPython.notebook.clear_all_output();')


In [7]:
# Close all ipywidgets widgets through the class-level `Widget.close_all()` call
from ipywidgets import Widget
Widget.close_all()


In [None]:

# Install (and upgrade) the packages listed in the cloned repository's requirements.txt, bypassing the pip cache
!pip install -r "/content/vs_rag_wkshp/requirements.txt" --upgrade --no-cache-dir

In [None]:
import os
from pymongo import MongoClient

In [None]:
# Read the MongoDB connection string from the Colab secret named `mongo_uri`
from google.colab import userdata
MONGODB_URI = userdata.get('mongo_uri')
# Initialize a MongoDB Python client
mongodb_client = MongoClient(MONGODB_URI, appname="devrel.workshop.rag")
# Check the connection to the server by sending a `ping` command
mongodb_client.admin.command("ping")

In [None]:
# You may see a warning upon running this cell. You can ignore it.
import pandas as pd
from datasets import load_dataset

In [None]:
# Download the `train` split of the `mongodb/mongodb-docs` dataset from Hugging Face
data = load_dataset("mongodb/mongodb-docs", split="train")
# Convert the dataset into a dataframe first, then into a list of dictionaries (one per row)
docs = pd.DataFrame(data).to_dict("records")

In [None]:
# Number of documents in the dataset; compare it later with the number of chunks
len(docs)

In [None]:
# Preview the first document to understand its structure (the `body` field is what gets chunked later)
docs[0]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import Dict, List

In [None]:
# Separators for text data, tried in order by `RecursiveCharacterTextSplitter` (coarsest first).
# The empty string splits between every character and always applies, so the splitter never
# looks past it: it has to be the LAST entry, otherwise every separator listed after it is dead.
# Longer heading markers are listed before shorter ones so "###" is tried before "#".
separators = ["\n\n", "\n", " ", "###", "##", "#", ""]

In [None]:
# Token-aware splitter built with LangChain's `RecursiveCharacterTextSplitter`:
# text is first split on the `separators` defined above, then the pieces are merged
# back together until the configured chunk size (measured in tokens) is reached.
# Rule of thumb for text data: keep 1-2 paragraphs (~200 tokens) per chunk and let
# neighbouring chunks overlap by 15-20% of the chunk size to preserve context.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4",     # selects the encoder used for tokenization (GPT-4's encoder)
    separators=separators,
    chunk_size=200,
    chunk_overlap=30,
)

In [None]:
def get_chunks(doc: Dict, text_field: str) -> List[Dict]:
    """
    Split one document into several smaller documents along a text field.

    Args:
        doc (Dict): Parent document; it is left unmodified.
        text_field (str): Key of the string field in `doc` whose text is split.

    Returns:
        List[Dict]: One shallow copy of `doc` per chunk, each with `text_field`
        replaced by that chunk's text.
    """
    # `text_splitter` is the notebook-level splitter; `split_text` expects a string
    pieces = text_splitter.split_text(doc[text_field])
    # Unpacking `doc` makes a shallow copy; the trailing key overrides the original text
    return [{**doc, text_field: piece} for piece in pieces]

In [None]:
# Chunk the "body" field of every document in `docs` with `get_chunks` and flatten
# the per-document chunk lists into the single list `split_docs`.
split_docs = [chunk for doc in docs for chunk in get_chunks(doc, "body")]

In [None]:
# The length of `split_docs` is expected to be greater than the length of `docs`,
# because each document in `docs` has been split into multiple chunks
len(split_docs)

In [None]:
# Preview the first chunked document to understand its structure
# It is a copy of its parent document in which the `body` field holds one smaller chunk of the text
split_docs[0]

In [None]:
# Load the `thenlper/gte-small` embedding model using the Sentence Transformers library
# `tqdm` is imported here for the progress bar shown while embedding the chunks
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
embedding_model = SentenceTransformer("thenlper/gte-small")

In [None]:
# Helper that embeds a piece of text with the `embedding_model` instantiated above
# The model's output is converted with `tolist()` so callers receive a plain list
def get_embedding(text: str) -> List[float]:
    """
    Embed a piece of text.

    Args:
        text (str): Text to embed.

    Returns:
        List[float]: The text's embedding, converted to a list.
    """
    return embedding_model.encode(text).tolist()

In [None]:
# Build `embedded_docs`: one new dictionary per chunk in `split_docs`, carrying all of the
# chunk's fields plus an `embedding` field with the embedding of its `body` text
# (generated with the `get_embedding` function defined above).
# New dictionaries are created instead of adding the field to the chunks in place, so
# `split_docs` keeps the structure previewed earlier and is not aliased by `embedded_docs`.
embedded_docs = [
    {**doc, "embedding": get_embedding(doc["body"])}
    for doc in tqdm(split_docs)
]

In [None]:
# Check that the length of `embedded_docs` is the same as that of `split_docs`:
# every chunk must have produced exactly one embedded document
assert len(embedded_docs) == len(split_docs), "embedded_docs and split_docs differ in length"
len(embedded_docs)

In [None]:
def generate_unique_db_name(prefix: str = "mongodb_genai_devday_rag") -> str:
    """
    Generate a unique database name from a prefix, a timestamp and a random suffix.

    Args:
        prefix (str): Leading part of the database name.

    Returns:
        str: Name of the form `<prefix>_<YYYYMMDD>_<HHMMSS>_<8 hex characters>`.
    """
    # Imported locally so the function works no matter which other cells have been run
    import uuid
    from datetime import datetime

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    unique_id = str(uuid.uuid4())[:8]  # Use first 8 characters of a random UUID
    return f"{prefix}_{timestamp}_{unique_id}"


In [None]:
# Name of the database -- generated fresh (timestamp + random suffix) each time this cell runs;
# replace the call with a fixed string to reuse an existing database
import uuid
from datetime import datetime
DB_NAME = generate_unique_db_name()
# Name of the collection -- Change if needed or leave as is
COLLECTION_NAME = "knowledge_base"
# Name of the vector search index -- Change if needed or leave as is
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

In [None]:
# Get a handle to the `DB_NAME` database from the MongoDB client
db = mongodb_client[DB_NAME]

In [None]:
# Get a handle to the `COLLECTION_NAME` collection inside the `db` database defined above
collection = db[COLLECTION_NAME]

In [None]:
# Bulk delete all existing documents from the collection defined above (the empty filter `{}` matches every document)
collection.delete_many({})

In [None]:
# Bulk insert `embedded_docs` into the collection defined above
collection.insert_many(embedded_docs)

# Report how many documents the collection holds after the insert
print(f"Ingested {collection.count_documents({})} documents into the {COLLECTION_NAME} collection.")

In [None]:
# Vector search index definition, specifying:
# name: Name of the index (reused later when querying)
# type: Kind of search index, here a vector search index
# path: Path to the embeddings field in the stored documents
# numDimensions: Number of embedding dimensions - depends on the embedding model used
#   NOTE(review): 384 is expected to match the output size of `embedding_model` -- confirm
# similarity: Similarity metric. One of cosine, euclidean, dotProduct.
model = {
    "name": ATLAS_VECTOR_SEARCH_INDEX_NAME,
    "type": "vectorSearch",
    "definition": {
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 384,
                "similarity": "cosine",
            }
        ]
    },
}

In [None]:
# Create a vector search index with the above definition for the `collection` collection
import time

collection.create_search_index(model)

# The index is built asynchronously on the server, so a search issued right after
# creation can come back empty. Poll the index status until it reports itself as
# queryable, and give up with an error instead of waiting forever.
INDEX_READY_TIMEOUT_SECS = 300
INDEX_POLL_INTERVAL_SECS = 5
_index_deadline = time.monotonic() + INDEX_READY_TIMEOUT_SECS
while True:
    _index_info = list(collection.list_search_indexes(ATLAS_VECTOR_SEARCH_INDEX_NAME))
    if _index_info and _index_info[0].get("queryable"):
        break
    if time.monotonic() >= _index_deadline:
        raise TimeoutError(
            f"Search index {ATLAS_VECTOR_SEARCH_INDEX_NAME} was not queryable "
            f"after {INDEX_READY_TIMEOUT_SECS} seconds."
        )
    time.sleep(INDEX_POLL_INTERVAL_SECS)

print(f"Search index {ATLAS_VECTOR_SEARCH_INDEX_NAME} is ready to query.")

In [None]:
# Define a function to retrieve relevant documents for a user query using vector search
def vector_search(user_query: str) -> List[Dict]:
    """
    Retrieve relevant documents for a user query using vector search.

    Args:
        user_query (str): The user's query string.

    Returns:
        List[Dict]: Up to 5 matching documents, each reduced to its `body` text
        and a `score` field holding the vector search score.
    """
    # Embed the query with the same function used for the stored chunks
    query_embedding = get_embedding(user_query)

    # Aggregation pipeline: a $vectorSearch stage followed by a $project stage
    pipeline = [
        {
            "$vectorSearch": {
                "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": 150,  # controls the search scope
                "limit": 5,            # top K results
            }
        },
        {
            # Drop `_id` and the large `embedding` array; keep only the text and its score
            "$project": {
                "_id": 0,
                "body": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    # Execute the aggregation pipeline and materialize the cursor into a list
    return list(collection.aggregate(pipeline))

In [None]:
# Try the retrieval function on a sample question and display the returned documents
vector_search("What are some best practices for data backups in MongoDB?")

In [None]:
# Download the phi-2 model in GGUF format (Q4_K_M file) from Hugging Face and save it as `phi-2.gguf`
!wget -O phi-2.gguf https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q4_K_M.gguf


In [None]:
from llama_cpp import Llama

# Load the phi-2 model file downloaded above.
# The RAG prompt holds up to 5 retrieved chunks of ~200 tokens each plus the instructions
# and the question, so it can reach or exceed a 1024-token window before any answer
# tokens are generated. Use phi-2's full 2048-token context window instead.
llm = Llama(model_path="phi-2.gguf", n_ctx=2048)

In [None]:
# Build the prompt for the RAG application: retrieved context followed by the user's question
def create_prompt(user_query: str) -> str:
    """
    Assemble the prompt from the user query and the context retrieved for it.

    Args:
        user_query (str): The user's query string.

    Returns:
        str: Prompt containing the instructions, the retrieved context and the question.
    """
    # Fetch the most relevant documents for the query with `vector_search`
    retrieved_docs = vector_search(user_query)
    # Concatenate the document bodies, separated by a blank line
    context = "\n\n".join(doc.get('body') for doc in retrieved_docs)
    # Instructions first, then the context, then the question
    return f"Answer the question based only on the following context. If the context is empty, say I DON'T KNOW\n\nContext:\n{context}\n\nQuestion:{user_query}"

In [None]:
def format_chat_prompt(messages):
    """
    Flatten a list of role/content messages into a single prompt string.

    Each message with role "system", "user" or "assistant" becomes a tag line
    followed by its content; messages with any other role are skipped. The
    result always ends with an open assistant tag.
    """
    role_tags = {
        "system": "<|system|>",
        "user": "<|user|>",
        "assistant": "<|assistant|>",
    }
    segments = []
    for message in messages:
        tag = role_tags.get(message["role"])
        if tag is None:
            # Unknown role: contributes nothing to the prompt
            continue
        segments.append(f"{tag}\n{message['content']}\n")
    # Trailing tag cues the model to produce the assistant turn
    segments.append("<|assistant|>\n")
    return "".join(segments)

In [None]:
# Define a function to answer user queries
def generate_answer(user_query: str) -> None:
    """
    Generate an answer to the user query and print it.

    Args:
        user_query (str): The user's query string.
    """
    # Build the RAG prompt (instructions + retrieved context + question)
    rag_prompt = create_prompt(user_query)
    # Chat messages have the shape {"role": <role_value>, "content": <content_value>}.
    # The RAG prompt carries the user's question, so it is sent with the "user" role.
    messages = [{"role": "user", "content": rag_prompt}]
    # The LLM is called with a single string, so flatten the chat messages first
    llm_input = format_chat_prompt(messages)
    response = llm(llm_input, max_tokens=500)
    # Print the text of the first completion choice
    print(response["choices"][0]["text"])
In [None]:
# Run the full RAG flow (retrieve context, build the prompt, query the LLM) on a sample question
generate_answer("What are some best practices for data backups in MongoDB?")