[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mongodb-developer/ai-agents-lab-notebooks/blob/main/notebook_template.ipynb)


[![Lab Documentation and Solutions](https://img.shields.io/badge/Lab%20Documentation%20and%20Solutions-purple)](https://mongodb-developer.github.io/rag-lab/)


# Step 1: Install libraries


In [48]:
! pip install -qU pymongo langchain langchain-community langchain-mongodb bs4 tiktoken sentence_transformers

# Step 2: Set up prerequisites


In [79]:
# Connection string handed to MongoClient later in this notebook.
# NOTE(review): the value below looks like a placeholder -- presumably it must
# be replaced with a real MongoDB connection string before running; confirm.
MONGODB_URI = "<CODE_BLOCK_1>"

# Step 3: Create a knowledge base


### Load the dataset using LangChain WebBaseLoader


In [22]:
from langchain_community.document_loaders import WebBaseLoader
from pymongo import MongoClient

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [59]:
# Point the loader at four mongodb.com/developer articles about RAG
# (embedding-model choice, evaluation, chunking strategy, Gemma + Hugging Face).
loader = WebBaseLoader(
    [
        "https://www.mongodb.com/developer/products/atlas/choose-embedding-model-rag/",
        "https://www.mongodb.com/developer/products/atlas/evaluate-llm-applications-rag/",
        "https://www.mongodb.com/developer/products/atlas/choosing-chunking-strategy-rag/",
        "https://www.mongodb.com/developer/products/atlas/gemma-mongodb-huggingface-rag/",
    ]
)
# Load the pages; the items of `docs` expose `page_content` and `metadata`,
# which the following cells inspect.
docs = loader.load()

In [60]:
docs[0].page_content

'How to Choose the Right Embedding Model for Your LLM Application | MongoDBBlogAtlas Vector Search voted most loved vector database in 2024 Retool State of AI reportLearn more\xa0>>Developer Articles & TopicsGeneral InformationDocumentationDeveloper Articles & TopicsCommunity ForumsBlogUniversityProductsPlatformAtlasBuild on a developer data platformPlatform ServicesDatabaseDeploy a multi-cloud databaseSearchDeliver engaging search experiencesVector SearchDesign intelligent apps with GenAIStream ProcessingUnify data in motion and data at restToolsCompassWork with MongoDB data in a GUIIntegrationsIntegrations with third-party servicesRelational MigratorMigrate to MongoDB with confidenceSelf ManagedEnterprise AdvancedRun and manage MongoDB yourselfCommunity EditionDevelop locally with MongoDBBuild with MongoDB AtlasGet started for free in minutesSign UpTest Enterprise AdvancedDevelop with MongoDB on-premisesDownloadTry Community EditionExplore the latest version of MongoDBDownloadResourc

In [61]:
docs[0].metadata

{'source': 'https://www.mongodb.com/developer/products/atlas/choose-embedding-model-rag/',
 'title': 'How to Choose the Right Embedding Model for Your LLM Application | MongoDB',
 'description': 'In this tutorial, we will see why embeddings are important for RAG, and how to choose the right embedding model for your RAG application.',
 'language': 'en'}

### Chunk up the data


In [62]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [63]:
# Build the splitter through its tiktoken-based constructor with the
# "cl100k_base" encoding.
# NOTE(review): chunk_size=200 and chunk_overlap=30 are presumably counted in
# tokens of that encoding rather than characters -- confirm in the LangChain docs.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base", chunk_size=200, chunk_overlap=30
)

In [64]:
split_docs = text_splitter.split_documents(docs)

In [65]:
len(split_docs)

202

In [66]:
# Turn every chunk into a plain dict so that an "embedding" key can be added
# to it and the result inserted into MongoDB further down.
split_docs = [chunk.dict() for chunk in split_docs]

In [67]:
split_docs[0]

{'id': None,
 'metadata': {'source': 'https://www.mongodb.com/developer/products/atlas/choose-embedding-model-rag/',
  'title': 'How to Choose the Right Embedding Model for Your LLM Application | MongoDB',
  'description': 'In this tutorial, we will see why embeddings are important for RAG, and how to choose the right embedding model for your RAG application.',
  'language': 'en'},
 'page_content': 'How to Choose the Right Embedding Model for Your LLM Application | MongoDBBlogAtlas Vector Search voted most loved vector database in 2024 Retool State of AI reportLearn more\xa0>>Developer Articles & TopicsGeneral InformationDocumentationDeveloper Articles & TopicsCommunity ForumsBlogUniversityProductsPlatformAtlasBuild on a developer data platformPlatform ServicesDatabaseDeploy a multi-cloud databaseSearchDeliver engaging search experiencesVector SearchDesign intelligent apps with GenAIStream ProcessingUnify data in motion and data at restToolsCompassWork with MongoDB data in a GUIIntegra

### Generate embeddings


In [68]:
from sentence_transformers import SentenceTransformer

In [69]:
embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

In [70]:
def get_embedding(text: str):
    """Embed `text` with the notebook-level `embedding_model`.

    Args:
        text: The string to embed.

    Returns:
        The result of `embedding_model.encode(text)` converted via `.tolist()`.
    """
    return embedding_model.encode(text).tolist()

In [71]:
# Build a new dict per chunk: all of the chunk's fields plus an "embedding"
# entry computed from its "page_content".
embedded_docs = [
    dict(chunk, embedding=get_embedding(chunk["page_content"]))
    for chunk in split_docs
]

In [75]:
# Copy each chunk dict and attach its embedding to the copy, so the entries of
# `split_docs` themselves are left without an "embedding" key.
embedded_docs = []
for doc in split_docs:
    temp = dict(doc)
    temp["embedding"] = get_embedding(doc["page_content"])
    embedded_docs.append(temp)

### Ingest documents into MongoDB


In [80]:
# Initialize a MongoDB Python client
client = MongoClient(MONGODB_URI)

In [81]:
# Name of the database -- Change if needed or leave as is
DB_NAME = "mongodb_rag_lab"
# Name of the collection -- Change if needed or leave as is
COLLECTION_NAME = "knowledge_base"
# Name of the vector search index -- Change if needed or leave as is
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

In [82]:
# Connect to the collection defined above using the MongoDB client
collection = client[DB_NAME][COLLECTION_NAME]

In [83]:
# Bulk delete all existing records from the collection defined above -- should be a one-liner
collection.delete_many({})

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff000000000000000c'), 'opTime': {'ts': Timestamp(1720477037, 15), 't': 12}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1720477037, 15), 'signature': {'hash': b'\x04\xff@#\xf6\xe7\x7f\xcc\xab|zrR\x94\x7f\xe9,\x9a\xe0\xb6', 'keyId': 7353010953081847814}}, 'operationTime': Timestamp(1720477037, 15)}, acknowledged=True)

In [84]:
# Bulk insert `embedded_docs` into the collection defined above -- should be a one-liner
collection.insert_many(embedded_docs)

print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed
