In [None]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): the recorded output of this cell reports version conflicts
# (langchain-core 0.2.10 vs. langchain-community 0.0.20 and langchain-openai
# 0.1.6). Consider pinning versions or installing everything with a single
# `%pip install -U ...` call so the packages are resolved together — TODO confirm.
# pip also notes that the kernel may need a restart after installation.
%pip install -U --quiet langchain langsmith langchainhub langchain_benchmarks
%pip install --quiet chromadb openai huggingface pandas langchain_experimental sentence_transformers pyarrow anthropic tiktoken

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.1.6 requires langchain-core<0.2.0,>=0.1.46, but you have langchain-core 0.2.10 which is incompatible.
langchain-experimental 0.0.62 requires langchain-community<0.3.0,>=0.2.6, but you have langchain-community 0.0.20 which is incompatible.
langchain-community 0.0.20 requires langchain-core<0.2,>=0.1.21, but you have langchain-core 0.2.10 which is incompatible.
langchain-community 0.0.20 requires langsmith<0.1,>=0.0.83, but you have langsmith 0.1.82 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os

# LangSmith connection settings, exported as environment variables.
# The API key below is a placeholder: replace it with your own key.
os.environ.update(
    {
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_API_KEY": "LANGCHAIN_API_KEY_HERE",
    }
)

In [None]:
# Update these with your own API keys (the values below are placeholders).
os.environ.update(
    {
        "ANTHROPIC_API_KEY": "ANTHROPIC_API_KEY_HERE",
        "OPENAI_API_KEY": "OPENAI_API_KEY_HERE",
        # Silence warnings from HuggingFace
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [None]:
import uuid

# Short random tag identifying this experiment run:
# take a fresh UUID4 as 32 hex characters, then keep only the first six.
run_uid = uuid.uuid4().hex
run_uid = run_uid[:6]

In [None]:
from langchain_benchmarks import clone_public_dataset, registry
from langsmith import traceable

In [None]:
# Keep only the entries whose Type is "RetrievalTask".
# NOTE(review): this rebinds the imported `registry` name to the filtered
# result, so re-running the cell filters the already-filtered registry.
registry = registry.filter(Type="RetrievalTask")
# Trailing expression: display the filtered registry as the cell output.
registry

Name,Type,Dataset ID,Description
LangChain Docs Q&A,RetrievalTask,452ccafc-18e1-4314-885b-edd735f17b9d,Questions and answers based on a snapshot of the LangChain python docs. The environment provides the documents and the retriever information. Each example is composed of a question and reference answer. Success is measured based on the accuracy of the answer relative to the reference answer. We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Semi-structured Reports,RetrievalTask,c47d9617-ab99-4d6e-a6e6-92b8daf85a7d,Questions and answers based on PDFs containing tables and charts. The task provides the raw documents as well as factory methods to easily index them and create a retriever. Each example is composed of a question and reference answer. Success is measured based on the accuracy of the answer relative to the reference answer. We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Multi-modal slide decks,RetrievalTask,40afc8e7-9d7e-44ed-8971-2cae1eb59731,This public dataset is a work-in-progress and will be extended over time.  Questions and answers based on slide decks containing visual tables and charts. Each example is composed of a question and reference answer. Success is measured based on the accuracy of the answer relative to the reference answer.


In [None]:
# Select the "LangChain Docs Q&A" task from the registry by name.
langchain_docs = registry["LangChain Docs Q&A"]
# Trailing expression: display the task summary as the cell output.
langchain_docs

0,1
Name,LangChain Docs Q&A
Type,RetrievalTask
Dataset ID,452ccafc-18e1-4314-885b-edd735f17b9d
Description,Questions and answers based on a snapshot of the LangChain python docs. The environment provides the documents and the retriever information. Each example is composed of a question and reference answer. Success is measured based on the accuracy of the answer relative to the reference answer. We also measure the faithfulness of the model's response relative to the retrieved documents (if any).
Retriever Factories,"basic, parent-doc, hyde"
Architecture Factories,conversational-retrieval-qa
get_docs,


In [None]:
@traceable
def langchain_docs_query(question: str):
    """Forward ``question`` to ``langchain_docs.query`` and return its result.

    NOTE(review): the task summary displayed for ``langchain_docs`` does not
    list a ``query`` attribute — confirm it exists before relying on this
    helper.
    """
    result = langchain_docs.query(question)
    return result
# Clone the task's public dataset under the task's own name.
clone_public_dataset(
    langchain_docs.dataset_id,
    dataset_name=langchain_docs.name,
)

Dataset LangChain Docs Q&A already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/c9534f73-04c8-5644-b464-e311feca6a81/datasets/ec76a09e-14d8-44c5-bced-7e1bbe582546.


In [None]:
# Materialize every document the task provides into a list.
docs = [*langchain_docs.get_docs()]
# Preview: the first 100 characters of the first document's repr.
print(f"{repr(docs[0])[:100]}...")

Document(page_content="LangChain cookbook | 🦜️🔗 Langchain\n\n[Skip to main content](#docusaurus_skip...


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

# Embedding model used to index and query the documents.
# To select device 0 explicitly, pass model_kwargs={"device": 0} here.
embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-base")

# Chroma collection for this benchmark, stored on disk under ./chromadb.
vectorstore = Chroma(
    embedding_function=embeddings,
    collection_name="lcbm-b-huggingface-gte-base",
    persist_directory="./chromadb",
)

# The vector store is configured with a persist directory, so its contents
# outlive this kernel. Adding the documents unconditionally on every run would
# store another copy of the whole corpus each time and make the retriever
# return duplicate hits. Index only when the collection is still empty.
# NOTE(review): a partially indexed collection (e.g. after an interrupted run)
# is treated as populated; delete the persist directory to force a rebuild.
if not vectorstore.get(limit=1)["ids"]:
    vectorstore.add_documents(docs)

# Retriever that requests 6 results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})

  warn_deprecated(


In [None]:
from operator import itemgetter
from typing import Sequence

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.document import Document
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable.passthrough import RunnableAssign


# Turns the documents returned by the retriever into a single
# string that can be placed in the prompt for the LLM.
@traceable
def format_docs(docs: Sequence[Document]) -> str:
    """Render ``docs`` as one ``<documents>`` block.

    Each document becomes a ``<document index='i'>`` element holding its
    ``source`` metadata value (``None`` when the key is absent) and its page
    content. Elements are joined with newlines and wrapped in
    ``<documents>`` ... ``</documents>``.
    """
    rendered = "\n".join(
        f"<document index='{position}'>\n"
        f"<source>{document.metadata.get('source')}</source>\n"
        f"<doc_content>{document.page_content}</doc_content>\n"
        "</document>"
        for position, document in enumerate(docs)
    )
    return f"<documents>\n{rendered}\n</documents>"


# Two-message chat prompt: a system message carrying the formatted documents
# via {context}, followed by the user's {question}.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an AI assistant answering questions about LangChain.\n"
            "{context}\n"
            "Respond solely based on the document content.",
        ),
        ("human", "{question}"),
    ]
)
# Chat model that writes the answer.
# NOTE(review): temperature=1 presumably makes answers vary between runs;
# consider a lower value if reproducible output matters.
llm = ChatOpenAI(temperature=1, model="gpt-3.5-turbo")

# prompt -> model -> plain string, configured with run name "GenerateResponse".
response_generator = prompt | llm | StrOutputParser()
response_generator = response_generator.with_config(run_name="GenerateResponse")

# This is the final response chain.
# Input: a dict with a "question" key.
# Step 1 takes the "question" value, passes it to the retriever, and
# formats the retrieved documents as a string stored under "context".
# Step 2 feeds the result to response_generator to produce the answer.

chain = (
    RunnableAssign(
        {
            # Sub-chain that builds the "context" value; it runs under the
            # name "FormatDocs".
            "context": (itemgetter("question") | retriever | format_docs).with_config(
                run_name="FormatDocs"
            )
        }
    )
    # The "RunnableAssign" above returns a dict with keys
    # question (from the original input) and
    # context (the string-formatted docs).
    # That dict is passed to the response_generator defined earlier.
    | response_generator
)

  warn_deprecated(


In [None]:
# Smoke-test the chain end to end with a sample question; the returned
# answer is shown as the cell output.
chain.invoke({"question": "Tell me how a chain works in LangChain. In 3 sentences."})

'A chain in LangChain is a sequence of operations executed in response to an input. It consists of a series of customizable components (such as language models, memory stores, and callbacks) that work together to generate outputs. Chains in LangChain can be used for various tasks, such as having a conversation, loading context from memory, and more, making them versatile and powerful tools for building AI workflows.'