In [12]:
from langchain_ollama import OllamaLLM, OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain_core.runnables import RunnablePassthrough

In [3]:
# Ollama-served models used throughout the notebook: one generates text,
# the other produces the embeddings stored in the vector database.
LLM_MODEL = "llama3.2"
EMBEDDING_MODEL = "nomic-embed-text"

llm = OllamaLLM(model=LLM_MODEL)
embedding = OllamaEmbeddings(model=EMBEDDING_MODEL)

In [7]:

# NOTE: machine-specific absolute path to the source PDF; change it to match
# your environment before running this cell.
file_path = "/home/bishwayansaha99/langchain/docs/attention.pdf"

# Load the PDF, then split it into overlapping chunks
# (chunk_size=1000, chunk_overlap=100) that are embedded below.
loader = PyPDFLoader(file_path)
documents = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
doc_chunks = splitter.split_documents(documents)

# Embed every chunk with `embedding` and index it in a Chroma vector store.
print("===== Creating New Vector Database =====")
db = Chroma.from_documents(
    documents=doc_chunks,
    embedding=embedding
)

# The store was already given its embedding function above, and as_retriever()
# only documents `search_type` / `search_kwargs`, so no `embedding=` keyword
# is passed here (it was not a recognised option).
retriever = db.as_retriever()

===== Creating New Vector Database =====


In [10]:
# Prompt asking the model to draft a hypothetical answer to a question
# from its own knowledge, without consulting the indexed document.
HYPOTHETICAL_ANSWER_TEMPLATE = """
    For the given question, try to generate a hypothetical answer with your knowledge.
    Only return the answer.
    ------------------------
    Question: {question}
"""
prompt_template = ChatPromptTemplate.from_template(HYPOTHETICAL_ANSWER_TEMPLATE)

# prompt -> LLM -> plain string
hypothetical_chain = prompt_template | llm | StrOutputParser()

hypothetical_inputs = {'question': 'What is attention in transformer model?'}
hypothetical_answer = hypothetical_chain.invoke(hypothetical_inputs)
print(hypothetical_answer)


In a transformer model, attention allows the neural network to weigh the importance of different input elements when processing sequences. It's a mechanism that enables the model to focus on the most relevant parts of the input data, rather than considering every element equally. This is achieved through self-attention mechanisms or multi-head attention, which compute weights for every pair of input tokens and sum them up to produce an output.


In [13]:
def format_response(docs):
    """Join the ``page_content`` of every item in ``docs`` into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The contents in their original order, separated by a blank line
        (``"\\n\\n"``); an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    return separator.join(document.page_content for document in docs)
# RAG prompt fetched from the LangChain Hub; it is filled with the
# `context` and `question` values produced by the mapping below.
rag_prompt = hub.pull("rlm/rag-prompt")

# The chain's single input is fanned out to both keys: it is sent through the
# retriever (and flattened to text) for `context`, and forwarded unchanged
# as `question`.
rag_inputs = {
    'context': retriever | format_response,
    'question': RunnablePassthrough()
}
rag_chain = rag_inputs | rag_prompt | llm | StrOutputParser()



In [14]:
# `rag_chain` feeds its single input to BOTH the retriever and the prompt's
# `question` slot, so invoking it with the hypothetical answer asks the model
# to "answer" the hypothetical answer instead of the user's question.
# Here the hypothetical answer is used only to retrieve context, and the
# prompt receives the original question.
# Must match the question that was used to generate `hypothetical_answer`.
question = 'What is attention in transformer model?'
hypothetical_docs = retriever.invoke(hypothetical_answer)
answer_chain = rag_prompt | llm | StrOutputParser()
answer_chain.invoke({
    'context': format_response(hypothetical_docs),
    'question': question,
})

'The Transformer model uses self-attention mechanisms to weigh the importance of different input elements when processing sequences. This is achieved through self-attention or multi-head attention, which compute weights for every pair of input tokens. Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions.'