In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Milvus
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser  # ✅ Parser for text output
from langchain_core.runnables import RunnableLambda, RunnablePassthrough  # For chaining

# 1. Read the PDF and break its pages into overlapping text chunks
loader = PyPDFLoader("attention.pdf")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
documents = text_splitter.split_documents(docs)

# 2. Embedding model used to vectorize the chunks
embedding = OpenAIEmbeddings()

# 3. Index only the first 30 chunks in the "attention_docs" Milvus collection
# NOTE(review): re-running this cell presumably inserts the same chunks into
# an already-existing collection again — confirm whether that is intended.
milvus_vectorstore = Milvus.from_documents(
    documents=documents[:30],
    embedding=embedding,
    collection_name="attention_docs",
    connection_args={"host": "localhost", "port": "19530"},  # or your IP
)

# 4. Chat model that writes the final answer
llm = ChatOpenAI(
    model="gpt-4",
    temperature=0.7,
)

# 5. Prompt template with two placeholders: {context} and {input}.
# The template is spelled out line by line so that the trailing spaces and
# the leading/trailing newlines of the text are explicit.
prompt = ChatPromptTemplate.from_template(
    "\n"
    "Answer the following question based only on the provided context. \n"
    "Think step by step before providing a detailed answer. \n"
    "I will tip you $1000 if the user finds the answer helpful. \n"
    "<context>\n"
    "{context}\n"
    "</context>\n"
    "Question: {input}\n"
)

# 6. Retriever view over the Milvus vector store
retriever = milvus_vectorstore.as_retriever()

# 7. Chain step: look up context for the question via the retriever
def retrieve_context(inputs):
    """Fetch documents for ``inputs["input"]`` and package them for the prompt.

    Args:
        inputs: Mapping with an ``"input"`` key holding the question text.

    Returns:
        A dict with ``"context"`` (the ``page_content`` of every retrieved
        document, joined by blank lines) and ``"input"`` (the question,
        passed through unchanged).
    """
    question = inputs["input"]
    passages = retriever.invoke(question)
    context = "\n\n".join(passage.page_content for passage in passages)
    return {"context": context, "input": question}

# 8. Output parser placed after the LLM so the chain yields text
parser = StrOutputParser()

# 9. Pipeline: question -> retrieved context -> filled prompt -> LLM -> parsed text
chain = RunnableLambda(retrieve_context) | prompt | llm | parser

# 10. Ask a question and print the answer
response = chain.invoke({"input": "Scaled Dot-Product Attention"})
print(response)


The Scaled Dot-Product Attention is a type of attention mechanism in machine learning. The input for this mechanism consists of queries and keys of dimension $d_k$, and values of dimension $d_v$. The mechanism computes the dot products of the query with all keys, then divides each by $\sqrt{d_k}$, and applies a softmax function to determine the weights on the values. The attention function is computed on a set of queries simultaneously, which are packed together into a matrix $Q$. The keys and values are also packed together into matrices $K$ and $V$. The output is computed as: $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$. This mechanism is similar to dot-product attention, except for the scaling factor of $\frac{1}{\sqrt{d_k}}$. It is faster and more space-efficient than additive attention because it can be implemented using highly optimized matrix multiplication code. However, for larger values of $d_k$, the dot products can grow large in magnitude, which can push the softmax function into regions