In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
# load_dotenv() is expected to have exported the .env entries into os.environ
# already (python-dotenv behaviour; confirm for the installed version), so the
# re-assignment below mostly serves as a presence check.
# os.environ only accepts str values: assigning the None that os.getenv()
# returns for a missing key fails with an opaque TypeError, so check
# explicitly and name the missing key instead.
for _required_key in ('OPENAI_API_KEY', 'TAVILY_API_KEY'):
    _key_value = os.getenv(_required_key)
    if _key_value is None:
        raise RuntimeError(
            f"{_required_key} is not set; add it to your .env file or export it "
            "before running this notebook."
        )
    os.environ[_required_key] = _key_value

In [2]:
## LangSmith params for observability
# os.getenv() returns None when the key is absent, and os.environ rejects None
# with an opaque TypeError; fail with a message that names the missing key.
_langsmith_key = os.getenv('LANGSMITH_API_KEY')
if _langsmith_key is None:
    raise RuntimeError(
        "LANGSMITH_API_KEY is not set; add it to your .env file or export it "
        "before enabling tracing."
    )
os.environ['LANGSMITH_API_KEY'] = _langsmith_key
os.environ['LANGSMITH_PROJECT'] = 'LLM_OBS_YT'
os.environ['LANGSMITH_TRACING'] = "true"

### RAG Vector DB Population

In [3]:
# All imports for this cell are grouped here. HuggingFaceBgeEmbeddings is
# imported from langchain_community (already a dependency of this notebook)
# rather than through the deprecated `langchain.embeddings` re-export.
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma

### Load the source PDF
loader = PyPDFLoader('sample_doc.pdf')
docs = loader.load()

### BGE Embeddings
# NOTE(review): the recorded cell output suggests this constructor emits a
# warning -- presumably a class-level deprecation notice; confirm against the
# installed langchain_community version before swapping in a replacement.
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

### Creating Retriever using Vector DB
# The loaded documents are indexed as-is (no text splitting step); the
# retriever returns the 3 closest matches per query.
db = Chroma.from_documents(docs, embeddings)
retriever = db.as_retriever(search_kwargs={"k": 3})

  embeddings = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


### RAG with LangChain (LCEL)

In [4]:
# ChatPromptTemplate is imported from langchain_core, like the other LCEL
# primitives below, instead of through the legacy `langchain.prompts` re-export.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Chat model shared by the LCEL chain and by every agent defined later on.
llm = ChatOpenAI(model="gpt-4o-mini")

In [5]:
def generate_response(question: str) -> str:
    """Answer ``question`` using only context fetched by the module-level retriever.

    The chain retrieves documents for the question, flattens them to plain
    text, fills the prompt template, calls ``llm`` and parses the reply into
    a string.

    Args:
        question: The user's question; it is used both as the retrieval query
            and as the ``{question}`` slot of the prompt.

    Returns:
        The model's answer as a plain string.
    """

    template = """Answer the question based only on the following context:
        {context}

        Question: {question}
        """
    prompt = ChatPromptTemplate.from_template(template)

    def _format_docs(retrieved_docs) -> str:
        # Without this step the prompt receives the repr() of a list of
        # Document objects (metadata and escaped newlines included) rather
        # than the document text itself.
        return "\n\n".join(doc.page_content for doc in retrieved_docs)

    retrieval_chain = (
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return retrieval_chain.invoke(question)

In [6]:
# Run the LCEL RAG chain once and print the answer it produces.
response = generate_response(
    question="Tell me about mutlihead attention in transformers"
)
print(response)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


Multi-head attention is a key component of the transformer architecture that enhances the model's ability to capture various patterns and relationships within input data. It operates by applying the scaled dot-product attention function in parallel across multiple attention heads. Each attention head processes the input using its own set of learnable weights to generate different representations of the input data.

Mathematically, the attention outputs from each head are concatenated and transformed into a single matrix to form a comprehensive representation of the input. This approach allows the model to learn and integrate information from various segments of the input sequence effectively. By utilizing multiple heads, the multi-head attention mechanism enables the transformer to jointly attend to information from different perspectives, which results in improved network performance in tasks such as natural language processing. 

Overall, multi-head attention enhances the representat

### LangGraph Agent with RAG + WebSearch (MultiAgent Supervisor)

In [18]:
## LangSmith params for observability
# This repeats the LangSmith configuration from the earlier cell so the agent
# section can be run on its own.
# os.getenv() returns None when the key is absent, and os.environ rejects None
# with an opaque TypeError; fail with a message that names the missing key.
_langsmith_key = os.getenv('LANGSMITH_API_KEY')
if _langsmith_key is None:
    raise RuntimeError(
        "LANGSMITH_API_KEY is not set; add it to your .env file or export it "
        "before enabling tracing."
    )
os.environ['LANGSMITH_API_KEY'] = _langsmith_key
os.environ['LANGSMITH_PROJECT'] = 'LLM_OBS_YT'
os.environ['LANGSMITH_TRACING'] = "true"

In [19]:
from langgraph.prebuilt import create_react_agent

### Tools Creation

In [9]:
from langchain_community.tools.tavily_search import TavilySearchResults

# Web search tool used by the research agent.
# NOTE(review): the recorded cell output suggests this constructor emits a
# warning -- presumably a deprecation notice; confirm before upgrading.
TAVILY_MAX_RESULTS = 5  # upper bound on results requested per search
tavily_tool = TavilySearchResults(max_results=TAVILY_MAX_RESULTS)

  tavily_tool = TavilySearchResults(max_results=5)


### Create specialized Agents

In [10]:
### Research Agent for Web Search

def web_search(query: str) -> str:
    """Search the web for information."""
    # The docstring above is kept short and unchanged on purpose: it is
    # presumably what the agent framework exposes to the model as the tool
    # description (confirm), so extra documentation lives in comments.
    #
    # Takes the search query and returns the "content" field of every result,
    # one result per line.
    search_results = tavily_tool.invoke({"query": query})

    # NOTE(review): the Tavily tool appears to hand back an error string
    # instead of a result list when the request fails (confirm for the
    # installed version). Indexing a string with "content" would raise a
    # confusing TypeError, so pass the message through for the agent to see.
    if isinstance(search_results, str):
        return search_results

    return "\n".join(hit["content"] for hit in search_results)

# ReAct-style agent whose only tool is `web_search`; it is registered under
# the name "research_expert".
research_agent = create_react_agent(
    name="research_expert",
    model=llm,
    tools=[web_search],
    prompt="You are a world class researcher with access to web search.",
)

In [12]:
## RAG Agent

def rag_search(query: str):
    """Function to do RAG search"""
    # The docstring text is left unchanged: it is presumably what the agent
    # framework exposes to the model as the tool description (confirm).
    #
    # Looks up `query` in the module-level retriever and returns one string:
    # a header followed by a numbered section per retrieved document.
    hits = retriever.invoke(query)

    sections = ["\nRetrieved documents:\n"]
    for position, hit in enumerate(hits):
        sections.append(f"\n\n===== Document {position} =====\n" + hit.page_content)
    return "".join(sections)

# ReAct-style agent whose only tool is `rag_search`; it is registered under
# the name "rag_expert".
rag_agent = create_react_agent(
    name="rag_expert",
    model=llm,
    tools=[rag_search],
    prompt="You are a RAG tool with access to transformer applications on Deep Learning related tasks.",
)

In [20]:
from langgraph_supervisor import create_supervisor

# Supervisor that routes each request to one of the two agents.
# The prompt refers to the agents by the names they were created with
# ("research_expert" / "rag_expert"); presumably the supervisor's handoff
# tools are derived from those names (confirm), so the wording should match.
# Each sentence ends with a space so the concatenated prompt does not run
# sentences together.
workflow = create_supervisor(
    agents=[research_agent, rag_agent],
    model=llm,
    prompt=(
        "You are a supervisor managing a web search expert and a RAG search expert. "
        "For current events and information, use research_expert. "
        "For transformer related information, use rag_expert."
    )
)

# Compile and run
app = workflow.compile()

In [25]:
# Ask the supervisor graph the same transformer question that was sent to the
# LCEL chain earlier in the notebook.
user_turn = {
    "role": "user",
    "content": "Tell me about mutlihead attention in transformers",
}
result = app.invoke({"messages": [user_turn]})

Task supervisor with path ('__pregel_pull', 'supervisor') wrote to unknown channel is_last_step, ignoring it.
Task supervisor with path ('__pregel_pull', 'supervisor') wrote to unknown channel remaining_steps, ignoring it.


In [34]:
# Display the text of the last message in the returned state.
final_message = result["messages"][-1]
final_message.content

'Multi-head attention is a key concept in transformer architectures, allowing models to focus on different parts of the input sequence through multiple attention heads. Each head learns to pay attention to different aspects of the input, facilitating the capture of richer and more diverse information. It plays a crucial role in improving the performance of models in various natural language processing tasks. If you need further details or specific applications, feel free to ask!'

In [21]:
# A question about a recent event, which the supervisor prompt directs to the
# web search agent.
user_turn = {
    "role": "user",
    "content": "who is the winner of Last T20 Cricket World Cup?",
}
result = app.invoke({"messages": [user_turn]})

Task supervisor with path ('__pregel_pull', 'supervisor') wrote to unknown channel is_last_step, ignoring it.
Task supervisor with path ('__pregel_pull', 'supervisor') wrote to unknown channel remaining_steps, ignoring it.


In [22]:
# Display the text of the last message in the returned state.
final_message = result["messages"][-1]
final_message.content

"India won the last T20 Cricket World Cup, held in 2024, defeating South Africa by 7 runs in the final. This victory marks India's second T20 World Cup title, following their win in 2007."