In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
# from langchain_milvus import Milvus
from langchain_community.vectorstores import Milvus
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# 1. Load and split documents
loader = PyPDFLoader("attention.pdf")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
documents = text_splitter.split_documents(docs)

# 2. Generate embeddings
embedding = OpenAIEmbeddings()

# 3. Store in Milvus (connect to your Milvus instance)
milvus_vectorstore = Milvus.from_documents(
    documents[:30],
    embedding,
    connection_args={
        "host": "localhost",  # or your IP or Zilliz endpoint
        "port": "19530",
    },
    collection_name="attention_docs"
)

# 4. Prepare the LLM and prompt
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)

prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context. 
Think step by step before providing a detailed answer. 
I will tip you $1000 if the user finds the answer helpful. 
<context>
{context}
</context>
Question: {input}
""")

# 5. Create document chain and retrieval chain
document_chain = create_stuff_documents_chain(llm, prompt)
retriever = milvus_vectorstore.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

# 6. Invoke with a query
response = retrieval_chain.invoke({"input": "Scaled Dot-Product Attention"})
print(response['answer'])


  embedding = OpenAIEmbeddings()


Scaled Dot-Product Attention is a type of attention mechanism used in the Transformer model. It involves computing the dot products of queries with keys, dividing them by the square root of the dimensionality, and applying a softmax function to obtain weights on the values. This mechanism is used to determine the compatibility between the query and key values, ultimately generating output values based on these computations. It is an efficient and space-saving method compared to other attention mechanisms, especially for larger values of dimensionality.
