In [2]:
!pip install langchain_nvidia_ai_endpoints

Collecting langchain_nvidia_ai_endpoints
  Downloading langchain_nvidia_ai_endpoints-0.0.4-py3-none-any.whl (33 kB)
Collecting langchain-core<0.2.0,>=0.1.5 (from langchain_nvidia_ai_endpoints)
  Downloading langchain_core-0.1.33-py3-none-any.whl (269 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.1/269.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pillow<11.0.0,>=10.0.0 (from langchain_nvidia_ai_endpoints)
  Downloading pillow-10.2.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.2.0,>=0.1.5->langchain_nvidia_ai_endpoints)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain-core<0.2.0,>=0.1.5->langchain_nvidia_ai_endpoints)
  Downloading langsmith-0.1.31-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA

In [3]:
!pip install langchain-community langchain-text-splitters faiss-cpu



In [4]:
from langchain_community.document_loaders import WebBaseLoader

# Fetch the NVIDIA newsroom article; its text is what later cells split and index.
press_release_url = "https://nvidianews.nvidia.com/news/generative-ai-microservices-for-developers/"
loader = WebBaseLoader(press_release_url)
docs = loader.load()

In [6]:
# Display the loaded documents to inspect what the loader returned.
docs

[Document(page_content="\n\n\n\n\n\n\n\nNVIDIA Launches Generative AI Microservices for Developers to Create and Deploy Generative AI Copilots Across NVIDIA CUDA GPU Installed Base | NVIDIA Newsroom\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n Artificial Intelligence Computing Leadership from NVIDIA\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPLATFORMS\n\n\n  Autonomous Machines\n\n\n\n  Cloud & Data Center\n\n\n\n  Deep Learning & Ai\n\n\n\n  Design & Pro Visualization\n\n\n\n  Healthcare\n\n\n\n  High Performance Computing\n\n\n\n  Self-Driving Cars\n\n\n\n  Gaming & Entertainment\n\n\n\n\n\nother links\n\n\nDevelopers\nIndustries\nShop\nDrivers\nSupport\nAbout NVIDIA\n\n\nView All Products\nGPU TECHNOLOGY CONFERENCE\nNVIDIA Blog\nCommunity\nCareers\nTECHNOLOGIES\n\n\n\n\n\n\n\n\n\nWatch NVIDIA CEO Jensen Huang's GTC keynote to catch all the announcements and more.\nWatch Now\nDismiss\n\n\n\n\n\n\n\

In [5]:
import getpass
import os

# The key is never hard-coded. In Colab it is read from the notebook's secret
# store, exactly as before; elsewhere (where google.colab is not importable) an
# already-exported NVIDIA_API_KEY is kept, or the user is prompted for one.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    os.environ['NVIDIA_API_KEY'] = userdata.get('NVIDIA_API_KEY')
elif not os.environ.get('NVIDIA_API_KEY'):
    os.environ['NVIDIA_API_KEY'] = getpass.getpass('NVIDIA_API_KEY: ')

# Created after the environment variable is set, preserving the original statement order.
embeddings = NVIDIAEmbeddings()

In [6]:
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the loaded page into overlapping chunks (500 characters, 50 shared
# between neighbours) before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
documents = text_splitter.split_documents(docs)

# Embed every chunk into a FAISS index and expose that index as a retriever.
vector = FAISS.from_documents(documents=documents, embedding=embeddings)
retriever = vector.as_retriever()

In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
# Chat model shared by the hypothetical-answer chain and the answer chain built in later cells.
model = ChatNVIDIA(model="mistral_7b")

In [8]:
# Show the model's repr to check how the client is configured.
model

ChatNVIDIA(client=NVEModel(base_url='https://api.nvcf.nvidia.com/v2/nvcf', get_session_fn=<class 'requests.sessions.Session'>, get_asession_fn=<class 'aiohttp.client.ClientSession'>, endpoints={'infer': '{base_url}/pexec/functions/{model_id}', 'status': '{base_url}/pexec/status/{request_id}', 'models': '{base_url}/functions'}, api_key=SecretStr('**********'), is_staging=False, timeout=60, interval=0.02, last_inputs={}, last_response=None, payload_fn=<function NVEModel.<lambda> at 0x7aeb261169e0>, headers_tmpl={'call': {'Accept': 'application/json', 'Authorization': 'Bearer {api_key}', 'User-Agent': 'langchain-nvidia-ai-endpoints'}, 'stream': {'Accept': 'text/event-stream', 'content-type': 'application/json', 'Authorization': 'Bearer {api_key}', 'User-Agent': 'langchain-nvidia-ai-endpoints'}}, stagify=functools.partial(<function NVEModel._stagify at 0x7aeb261171c0>, is_staging=False)), model='mistral_7b')

In [9]:
# HyDE step: ask the model for a plausible one-paragraph answer. That paragraph,
# not the raw question, is what gets used as the retrieval query.
hyde_template = (
    "Even if you do not know the full answer, generate a one-paragraph hypothetical answer to the below question:\n"
    "{question}"
)
hyde_prompt = ChatPromptTemplate.from_template(hyde_template)

# prompt -> chat model -> plain string
hyde_query_transformer = hyde_prompt | model | StrOutputParser()

In [10]:
from langchain_core.runnables import chain

@chain
def hyde_retriever(question):
    """Retrieve documents using a model-written hypothetical answer as the query.

    The question is first expanded by ``hyde_query_transformer`` into a
    hypothetical answer; that text is then passed to ``retriever`` and
    whatever the retriever returns is returned unchanged.
    """
    payload = {"question": question}
    hypothetical_answer = hyde_query_transformer.invoke(payload)
    return retriever.invoke(hypothetical_answer)

In [11]:
# Prompt for the final answer. {context} receives the retrieved material and
# {question} the user's original question.
# Fixed a typo in the instruction sent to the model: "of it's a greeting" -> "or it's a greeting".
template = """Answer the question based only on the following context:
{context}
If the question doesn't match with the context or it's a greeting then answer accordingly. But don't answer about topics other than the topics in the doc
Question: {question}
Always answer in points
"""

prompt = ChatPromptTemplate.from_template(template)
# prompt -> chat model -> plain string
answer_chain = prompt | model | StrOutputParser()

In [12]:
@chain
def final_chain(question):
    """Stream an answer to ``question`` grounded in HyDE-retrieved passages.

    Retrieves documents through ``hyde_retriever``, then yields the chunks
    produced by ``answer_chain.stream`` one by one.
    """
    documents = hyde_retriever.invoke(question)
    # Hand the prompt the passage text only. Interpolating the raw list would
    # insert its Python repr (metadata, escaped newlines) into the context.
    context = "\n\n".join(doc.page_content for doc in documents)
    yield from answer_chain.stream({"question": question, "context": context})

In [13]:
# Print each streamed chunk as it arrives; end="" keeps the chunks on one continuous line.
user_question = "Tell me about NVIDIA NIM"
for chunk in final_chain.stream(user_question):
    print(chunk, end="")

1. NVIDIA NIM (NVIDIA Inference Microservices) is a set of pre-built containers powered by NVIDIA inference software.
2. It includes Triton Inference Server™ and TensorRT™-LLM for reducing deployment times from weeks to minutes.
3. NVIDIA NIM provides the fastest and highest-performing production AI container for deploying models from various sources like NVIDIA, Google, Hugging Face, Microsoft, and open models.
4. NVIDIA NIM microservices can be accessed from popular platforms like Amazon SageMaker, Google Kubernetes Engine, and Microsoft Azure AI.
5. They can be integrated with popular AI frameworks like Deepset, LangChain, and LlamaIndex.
6. CUDA-X microservices provide end-to-end building blocks for data preparation, customization, and training to speed production AI development across industries.