In [30]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the PDF document as a list of Document objects.
file_path = "documents/langchain_docs.pdf"
loader = PyPDFLoader(file_path)
# Use load() rather than load_and_split(): load_and_split() already applies a
# default text splitter, so splitting its result again below would chunk the
# text twice with two different sets of parameters.
pages = loader.load()

# Split the pages into overlapping chunks so that text cut at a chunk boundary
# still appears intact in the neighbouring chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=500,
)

chunks = text_splitter.split_documents(pages)


In [31]:
# Preview only the first chunks; displaying the whole list floods the notebook output.
print(f"{len(chunks)} chunks in total")
chunks[:2]

[Document(metadata={'source': 'documents/langchain_docs.pdf', 'page': 0}, page_content="10/13/23, 2:00 PM Using LangSmith to Support Fine-tuning\nhttps://blog.langchain.dev/using-langsmith-to-support-fine-tuning-of-open-source-llms/ 1/15Using LangSmith to Suppo rt Fine-\ntuning\nBY LANGCHAIN 9 MIN READ AUG 23, 2023\nSummary\nWe created a guide for fine-tuning and evaluating LLMs using LangSmith for\ndataset management and evaluation. We did this both with an open source\nLLM on CoLab and HuggingFace for model training, as well as OpenAI's new\nfinetuning service. As a test case, we fine-tuned LLaMA2-7b-chat and gpt-3.5-\nSubscribe"),
 Document(metadata={'source': 'documents/langchain_docs.pdf', 'page': 1}, page_content="10/13/23, 2:00 PM Using LangSmith to Support Fine-tuning\nhttps://blog.langchain.dev/using-langsmith-to-support-fine-tuning-of-open-source-llms/ 2/15turbo for an extraction task (knowledge graph triple extraction) using training\ndata exported from LangSmith and also ev

In [32]:
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os

# Read variables from a local .env file (if present) into the process environment.
load_dotenv()

# Endpoint configuration: the GitHub token is used as the API key for the
# OpenAI-compatible inference endpoint below.
token = os.getenv("GITHUB_TOKEN")
if not token:
    # Fail here with a clear message instead of an opaque authentication error
    # on the first model call.
    raise RuntimeError(
        "GITHUB_TOKEN is not set; define it in the environment or in a .env file."
    )
endpoint = "https://models.inference.ai.azure.com"
model_name = "gpt-4o-mini"

# Initialize the ChatOpenAI LLM
llm = ChatOpenAI(
    openai_api_key=token,
    openai_api_base=endpoint,
    model_name=model_name,
    temperature=1.0,
    max_tokens=4000
)

In [33]:
# 1. Load Documents
# Read the PDF from disk into a list of Document objects.
file_path = "documents/langchain_docs.pdf"
loader = PyPDFLoader(file_path)
data = loader.load()

# 2. Split Documents
# Break the loaded documents into overlapping chunks for embedding/retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=500,
)
chunks = text_splitter.split_documents(data)


In [34]:
# 3. Store Chunks
# Embeddings must be produced by an embedding model. `model_name` holds the
# chat model used by the LLM, which cannot serve embedding requests, so a
# dedicated embedding model is configured here.
# NOTE(review): confirm this embedding model is available on `endpoint`.
embedding_model_name = "text-embedding-3-small"

embeddings = AzureOpenAIEmbeddings(
    model=embedding_model_name,
    azure_endpoint=endpoint,
    api_key=token
)

# Embed every chunk and index the vectors in an in-memory FAISS store.
vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)
retriever = vectorstore.as_retriever()

# 4. Retrieve Relevant Chunks
# (No additional code needed here; call retriever.invoke(question) to fetch
# the chunks most similar to a question.)


In [35]:
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain 
from langchain_core.prompts import ChatPromptTemplate

# 5. Generate Responses
# Prompt template with two placeholders: {context} receives the documents
# handed to the chain and {question} receives the user's query.
system_prompt = """
Use the following pieces of retrieved context to answer the user's question. 
If the context doesn't contain any relevant information to answer the question, say "I don't know".
<context>
{context}
</context>

Answer the question:

Question: {question}
"""

qa_prompt = ChatPromptTemplate.from_template(system_prompt)

# Build the "stuff" chain: the supplied documents are inserted into the
# prompt's {context} slot and the filled prompt is sent to the LLM.
question_answer_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=qa_prompt,
)




In [38]:
# Retrieve only the chunks relevant to the question and answer from those.
# Passing every chunk as context would bypass the vector store entirely and
# send the whole (overlapping) document to the model on each call.
question = "what is langsmith?"
relevant_chunks = retriever.invoke(question)

# Invoke the chain with the user's question and the retrieved chunks
result = question_answer_chain.invoke({
    "question": question,
    "context": relevant_chunks,
})
print(result)

LangSmith is a tool designed to support the fine-tuning and evaluation of large language models (LLMs). It aids in dataset management and evaluation processes, making it easier to collect, clean, and inspect data as well as evaluate the performance of fine-tuned models. LangSmith offers features like automatic logging of generations, a queryable interface for data selection, and assistance in running evaluations, thereby addressing common pain points in the fine-tuning workflow.
