## MongoDB and Anyscale Integration with LangChain

In [None]:
!pip install sqlalchemy==2.0.0 pypdf pymongo

In [2]:
import os
from getpass import getpass

from pymongo import MongoClient

# Secrets are read from the environment so they are never stored in the notebook.
# If a variable is unset, fall back to a hidden interactive prompt.
MONGODB_ATLAS_CLUSTER_URI = os.environ.get("MONGODB_ATLAS_CLUSTER_URI") or getpass(
    "MongoDB Atlas cluster URI: "
)
ANYSCALE_API_TOKEN = os.environ.get("ANYSCALE_API_TOKEN") or getpass(
    "Anyscale API token: "
)

# NOTE(review): these values look like placeholders; replace them with the real
# database, collection and Atlas Vector Search index names before running.
DB_NAME = "MONGODB_NAME"
COLLECTION_NAME = "COLLECTION_NAME"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "SEARCH_INDEX_NAME"

# initialize MongoDB python client
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)

# Handle to the collection used by the cells below.
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

### Document loading

In [3]:
from langchain_community.document_loaders import PyPDFLoader

# Read the local PDF (./ray.pdf) into `data`, which the chunking step consumes.
loader = PyPDFLoader(file_path="./ray.pdf")
data = loader.load()

### Text chunking

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=1000,
)
docs = text_splitter.split_documents(data)

### Insert documents into MongoDB Atlas with Anyscale embeddings

In [5]:
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_community.embeddings import AnyscaleEmbeddings

# Embed each chunk with Anyscale and write the chunks, together with their
# embeddings, into the MongoDB Atlas collection.
x = MongoDBAtlasVectorSearch.from_documents(
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
    embedding=AnyscaleEmbeddings(anyscale_api_key=ANYSCALE_API_TOKEN),
    documents=docs,
)

Downloading:   0%|          | 0.00/342 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/695k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

2024-03-21 08:13:49.689300: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-21 08:13:51.657743: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-03-21 08:13:51.657962: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64


### Create Vector Search from Atlas

In [6]:
# Connect to the same collection by its "<database>.<collection>" namespace and
# wrap it as a vector store, using the same embedding class as the insert step.
vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    MONGODB_ATLAS_CLUSTER_URI,
    f"{DB_NAME}.{COLLECTION_NAME}",
    embedding=AnyscaleEmbeddings(anyscale_api_key=ANYSCALE_API_TOKEN),
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

In [8]:
from langchain.chains import RetrievalQA

from langchain_community.chat_models import ChatAnyscale

# The question asked both with and without retrieved context.
question = "who is the creator of Ray"

# Chat model served by Anyscale; temperature is set to 0.
anyscale = ChatAnyscale(
    anyscale_api_key=ANYSCALE_API_TOKEN,
    model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
    temperature=0,
)

### Without context, the LLM cannot answer the question about Ray correctly

In [13]:
# Ask the chat model directly: only the question is passed, with no retrieved documents.
anyscale.invoke(question)

AIMessage(content=" I am a large language model trained by Mistral AI, and I don't have the ability to create individual entities or objects, such as characters in movies or books. As for the character Ray, I'm afraid I don't have specific information about his creator, as he could be a character from various sources. Could you please specify the Ray character you're referring to? This will help me provide a more accurate and helpful response.\n\nIn general, my main purpose is to provide accurate and reliable information while ensuring a positive and respectful interaction. I follow the principles of truthful, high-quality, and beneficial responses, as well as avoiding harmful, unethical, or prejudiced content. I am committed to promoting fairness and positivity in all my interactions.")

### With vector search and retrieval, the LLM can now answer the question correctly

In [14]:
# Build a question-answering chain that pairs the chat model with a retriever
# created from the Atlas vector store.
qa_chain = RetrievalQA.from_chain_type(anyscale, retriever=vector_search.as_retriever())

# Use .invoke() instead of calling the chain object directly: the direct-call
# form is deprecated in LangChain, and .invoke() matches the chat model call above.
qa_chain.invoke({"query": question})

{'query': 'who is the creator of Ray',
 'result': ' The creator of Ray is Robert Nishihara. He is a research scientist at UC Berkeley and a co-founder of the company Anyscale, which aims to democratize AI applications by bringing Ray to more users. Ray is a unified framework for distributed computing that unifies tasks and actors through a shared object store, allowing developers to express a wide range of parallelism and leverage both task and actor abstractions. It has been used by many people and several companies are running it in production. Ray provides a powerful combination of flexibility, performance, and ease of use for the development of future AI applications.'}