In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['LANGCHAIN_API_KEY'] = os.getenv('LANGCHAIN_API_KEY')
os.environ['LANGCHAIN_PROJECT'] = os.getenv('LANGCHAIN_PROJECT')
os.environ['LANGSMITH_TRACING'] = 'true'

In [3]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model='o1-mini')

llm

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x0000022C4231EFD0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x0000022C4288F590>, root_client=<openai.OpenAI object at 0x0000022C4270EF10>, root_async_client=<openai.AsyncOpenAI object at 0x0000022C427D80D0>, model_name='o1-mini', temperature=1.0, model_kwargs={}, openai_api_key=SecretStr('**********'))

In [4]:
from langchain_community.document_loaders import ArxivLoader

loader = ArxivLoader(
  load_max_docs=3,
  query='Attention is all you need',
)

In [5]:
documents = loader.load()

In [6]:
documents

[Document(metadata={'Published': '2024-07-22', 'Title': "Attention Is All You Need But You Don't Need All Of It For Inference of Large Language Models", 'Authors': 'Georgy Tyukin, Gbetondji J-S Dovonon, Jean Kaddour, Pasquale Minervini', 'Summary': 'The inference demand for LLMs has skyrocketed in recent months, and serving\nmodels with low latencies remains challenging due to the quadratic input length\ncomplexity of the attention layers. In this work, we investigate the effect of\ndropping MLP and attention layers at inference time on the performance of\nLlama-v2 models. We find that dropping dreeper attention layers only marginally\ndecreases performance but leads to the best speedups alongside dropping entire\nlayers. For example, removing 33\\% of attention layers in a 13B Llama2 model\nresults in a 1.8\\% drop in average performance over the OpenLLM benchmark. We\nalso observe that skipping layers except the latter layers reduces performances\nfor more layers skipped, except for 

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=800)

splitter

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x22c423307d0>

In [9]:
text_documents = splitter.split_documents(documents)

In [11]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

vector_store = FAISS.from_documents(documents, embeddings)

In [13]:
retriever = vector_store.as_retriever()

In [14]:
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate(
  input_variables=['context', 'question'],

  template="""Answer the following question based on the given context:
  <context>
  {context}
  </context>

  Question: {question},
  Answer:"""
  )

In [15]:
from langchain.chains.combine_documents import create_stuff_documents_chain

chain = create_stuff_documents_chain(
  llm=llm,
  prompt=prompt,
)


In [16]:
question = 'What is the attention mechanism in the transformer model?'

context = retriever.get_relevant_documents(question)

response = chain.invoke(
  {
    'context': context,
    'question': question
  }
)

print(response)

  context = retriever.get_relevant_documents(question)


In the Transformer model, the **attention mechanism**—specifically **self-attention**—is a core component that allows the model to weigh the importance of different parts of the input data dynamically. Here's a breakdown of how it works:

1. **Input Embeddings**: The model starts with a sequence of input embeddings represented as a matrix **H** of shape \( n \times d_h \), where \( n \) is the number of elements in the sequence (e.g., words in a sentence) and \( d_h \) is the dimensionality of each embedding.

2. **Linear Projections**:
   - **Queries (Q)**: \( Q = H W_Q \)
   - **Keys (K)**: \( K = H W_K \)
   - **Values (V)**: \( V = H W_V \)
   
   Here, \( W_Q \), \( W_K \), and \( W_V \) are learned projection matrices that transform the input embeddings into query, key, and value vectors, respectively.

3. **Scaled Dot-Product Attention**:
   - **Attention Scores**: Compute the raw attention scores by taking the dot product of queries and keys: \( Q K^T \).
   - **Scaling**: To m