In [13]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a readable message when the key is absent. The previous
# `os.environ[...] = os.getenv(...)` round-trip was a no-op when the key was
# set and raised an opaque TypeError when it was not (os.getenv returns None,
# and os.environ only accepts str values).
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

### LangSmith params for observability

In [14]:
## LangSmith params for observability
# Check the API key explicitly: assigning os.getenv(...) straight back into
# os.environ raises an opaque TypeError when the variable is missing, because
# os.getenv returns None and os.environ only accepts str values.
if not os.getenv('LANGSMITH_API_KEY'):
    raise RuntimeError(
        "LANGSMITH_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# Project name and tracing switch written into the environment for LangSmith.
os.environ['LANGSMITH_PROJECT'] = 'LLM_OBS_YT'
os.environ['LANGSMITH_TRACING'] = "true"

In [15]:
from langsmith import traceable
from openai import OpenAI
from typing import List

# Shared OpenAI client used by the traced `llm` helper later in the notebook.
# Constructed with no arguments; presumably it picks up OPENAI_API_KEY from the
# environment — TODO confirm against the OpenAI SDK docs.
openai_client = OpenAI()

### RAG Vector DB Population

In [16]:
from langchain_community.document_loaders import PyPDFLoader
# Load the sample PDF (path is relative to the working directory) into `docs`,
# which is indexed into the vector store further down.
loader = PyPDFLoader(file_path='sample_doc.pdf')
docs = loader.load()

### BGE Embeddings

# Imported from langchain_community, matching the loader and vector-store
# imports used elsewhere in this notebook; the old `langchain.embeddings`
# path for this class is deprecated.
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {'device': 'cpu'}  # run the embedding model on CPU
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

from langchain_community.vectorstores import Chroma

### Creating Retriever using Vector DB
# Embed the loaded documents into a Chroma collection, then expose it as a
# retriever configured with k=3 results per query.
db = Chroma.from_documents(documents=docs, embedding=embeddings)
retriever = db.as_retriever(search_kwargs=dict(k=3))

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


### Trace with LangSmith and Types of Traces (useful when we don't use LangChain/LangGraph)

LangSmith supports many different types of Runs - you can specify what type your Run is in the @traceable decorator. The types of runs are:

- LLM: Invokes an LLM
- Retriever: Retrieves documents from databases or other sources
- Tool: Executes actions with function calls
- Chain: Default type; combines multiple Runs into a larger process
- Prompt: Hydrates a prompt to be used with an LLM
- Parser: Extracts structured data

In [17]:
# Traced as a "retriever" run (not the generic "chain") so LangSmith records
# this step as document retrieval, matching the run types listed in this notebook.
@traceable(run_type="retriever")
def retrieve_documents(question: str):
    """Return the documents the module-level `retriever` finds for `question`.

    Args:
        question: Natural-language query passed to the retriever.

    Returns:
        Whatever `retriever.invoke` returns — presumably a list of LangChain
        documents (the retriever is created with k=3); callers read
        `.page_content` on each item.
    """
    return retriever.invoke(question)

In [18]:
@traceable(run_type="chain")
def generate_response(question: str, documents):
    """Build a context-grounded chat prompt and send it to `llm`.

    Args:
        question: The user's question.
        documents: Iterable of objects exposing `.page_content`; their text
            is joined with blank lines to form the prompt context.

    Returns:
        Whatever `llm` returns for the two-message (system + user) prompt.
    """
    context = "\n\n".join([doc.page_content for doc in documents])

    # System message restricts the model to the supplied context.
    system_message = {
        "role": "system",
        "content": "Answer the Question based only on the following Context. If the answer is not in the context, say 'I don't know'.",
    }
    # User message carries the retrieved context followed by the question.
    user_message = {
        "role": "user",
        "content": f"Context: {context} \n\n Question: {question}"
    }
    return llm([system_message, user_message])

In [19]:
@traceable(run_type="llm")
def llm(messages: List[dict]):
    """Send `messages` to the OpenAI chat completions endpoint.

    Args:
        messages: Chat messages as dicts (the caller in this notebook passes
            dicts with "role" and "content" keys).

    Returns:
        The raw completion object from `openai_client`; callers read
        `.choices[0].message.content` from it.
    """
    # temperature=0.0 is fixed here, along with the model name.
    return openai_client.chat.completions.create(
        messages=messages,
        model="gpt-4o-mini",
        temperature=0.0,
    )

In [20]:
@traceable(run_type="chain")
def rag_chain(question: str):
    documents = retrieve_documents(question)
    response = generate_response(question, documents)
    return response.choices[0].message.content

In [21]:
# First sample query through the traced pipeline.
# NOTE(review): "mutlihead" looks like a typo for "multihead"; the string is
# left unchanged so the recorded output below still corresponds to this input.
response = rag_chain(question="Tell me about mutlihead attention in transformers")

In [22]:
# Show the answer text produced by the previous cell.
print(response)

Multi-head attention in transformers is a crucial component that allows the neural network to learn and capture diverse characteristics of input sequential data. It enhances the representation of input contexts by merging information from distinct features of the attention mechanism, which can operate over both short and long ranges. This approach enables the attention mechanism to function jointly, resulting in improved network performance.

In the multi-head attention module, the scaled dot-product attention function is applied in parallel across multiple heads. Each head performs the attention mechanism using its own set of learnable weights (WkQ, WkK, and WkV). The outputs from each head are then concatenated and linearly transformed into a single matrix with the expected dimension. This parallel execution of attention allows the model to capture different aspects of the input data simultaneously, contributing to the overall effectiveness of the transformer architecture.


In [None]:
# Second sample query; rebinds `response` and prints the new answer.
response = rag_chain(question="Tell me about Encoder module in transformers")
print(response)