### Enviroment

In [2]:
import os 
from dotenv import load_dotenv
load_dotenv()
os.environ["HFToken"] = os.getenv("HFToken")
os.environ["groq_api_key"] = os.getenv("groq_api_key")

### WebDocument

In [3]:
from langchain_community.document_loaders import WebBaseLoader
import bs4

loader = WebBaseLoader(
    web_path="https://en.wikipedia.org/wiki/Attention_Is_All_You_Need",
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(class_="mw-body")
    ),
)

docs = loader.load()


  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


### Chunks 

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 40)
splitted_docs = splitter.split_documents(docs)

### Embedding Model

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model = "all-MiniLM-L6-v2")

### Vectorstores & Retriever


In [6]:
from langchain_community.vectorstores import FAISS
vectorstore = FAISS.from_documents(splitted_docs,embeddings)
retriever = vectorstore.as_retriever()

### Model 

In [7]:
from langchain_groq import ChatGroq
model = ChatGroq(model = "qwen/qwen3-32b")

# A) Normal Retriever 

### Prompt 

In [8]:
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_messages(
    [
        ("system","""
        "You are an accurate and concise AI assistant. "
        "You must answer strictly based on the given context. "
        "If the answer is not found in the context, say 'I don't know'. "
        "Do not include any reasoning steps, chain-of-thought, or analysis. "
        "Respond only with the final answer."
        {context}
         """),
        ("user","user question = {input}")
    ]
)

### Chain (context with autofill from retriever)

In [9]:
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains import create_retrieval_chain
model_with_context = create_stuff_documents_chain(model,prompt)
rag_chain = create_retrieval_chain(retriever,model_with_context)

### Chain Implementation 

In [10]:
response = rag_chain.invoke({"input" : "Scaled dot-product attention & self-attention"})
response['answer']

'<think>\nOkay, the user is asking about scaled dot-product attention and self-attention. Let me check the provided context.\n\nThe context mentions that the Transformer model uses scaled dot-product attention, which is defined by the formula involving Q, K, V matrices and a scaling factor based on the square root of d_k. It explains that this mechanism replaces RNNs or LSTMs, allowing parallel processing. The self-attention part comes from Q, K, V being from the same source, enabling the model to focus on different parts of the input.\n\nThe user wants a concise answer based on the context. I need to make sure I include the formula, the components (Q, K, V), the scaling factor, and how it allows parallelization. Also, mention that self-attention uses the same source for Q, K, V. Avoid any extra information not in the context. Keep it accurate and to the point.\n</think>\n\nScaled dot-product attention is defined as:  \n**Attention(Q, K, V) = softmax(QKᵀ / √d_k) × V**,  \nwhere **Q**, 

# B) History Aware Retriever

### Prompt for rewriting query (query that user will ask)

In [11]:
from langchain_core.prompts import MessagesPlaceholder
prompt_for_rewrite = ChatPromptTemplate.from_messages([
    ("system",
     """You are an intelligent assistant that rewrites the user's latest question 
     into a standalone search query.

     Your goal:
     - Understand the entire conversation history.
     - Resolve pronouns, references, and incomplete questions.
     - Produce a short, clean, context-aware query for retrieval.
     - Answer the question accurately provide only answer not your thinking content.
     - Only output the rewritten standalone query.
     """),
    MessagesPlaceholder("chat_history"),
    ("human", "User question: {input}"),
])

### History_Retrieval

In [12]:
from langchain_classic.chains import create_history_aware_retriever
history_aware_retriever = create_history_aware_retriever(model,retriever,prompt_for_rewrite) #LLM tumhare {input} question ko pichhli chat history ke base par smartly rewrite karke ek complete, standalone query bana deta hai taaki retriever usse sahi context ke saath search kar sake.

### Prompt for Chain 

In [13]:
prompt_for_chain = ChatPromptTemplate.from_messages([
    ("system",
     """You are an accurate assistant.
     Answer the user's question strictly using the context below."""),
    ("user", "{context}\nQuestion: {input}")
])


### Chain (context will autofill from history_retriever)

In [14]:
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains import create_retrieval_chain

model_with_context1 = create_stuff_documents_chain(model,prompt_for_chain)
rag_chain = create_retrieval_chain(history_aware_retriever,model_with_context1)

### Chain Implementation (with history)

In [15]:
chat_history = []
response = rag_chain.invoke({"input" : "why do we need this?", "chat_history" : chat_history})
print(response['answer'])

response1 = rag_chain.invoke({"input" : "what is Q K and V ", "chat_history" : chat_history})
chat_history.append(response1['answer'])

response2 = rag_chain.invoke({"input" : "why do we need this?", "chat_history" : chat_history})
print("***************************************************s",response2['answer'])

<think>
Okay, so the user is asking "why do we need this?" referring to the Transformer model introduced in the "Attention Is All You Need" paper. Let me start by recalling the context given. The paper was published in 2017 by Google researchers, and it introduced the Transformer architecture, which uses self-attention mechanisms instead of RNNs or LSTMs. The key points from the context are that Transformers allow better performance due to scaled dot-product attention and self-attention, and they're considered foundational for modern AI, especially large language models.

The user is probably asking why Transformers are necessary over previous models. I need to explain the limitations of earlier models like RNNs and LSTMs. RNNs process data sequentially, which is slow and makes it hard to handle long-range dependencies. LSTMs improved some aspects but still have issues with vanishing gradients and aren't as efficient. Transformers, on the other hand, use self-attention to process all p

# C) SessionWise Memory (extended version of history_aware)

### Session wise ChatHistory 

In [16]:
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

store = {}

def session_wise_history(session_id) -> BaseChatMessageHistory:
    if session_id not in store :
        store[session_id] = ChatMessageHistory()
    return store[session_id]

rag_chain_session = RunnableWithMessageHistory(rag_chain, session_wise_history, input_messages_key="input", history_messages_key="chat_history", output_messages_key="answer")

### Running session RAG application

In [17]:
configuration1 = {"configurable" : {"session_id" : "chat1"}}

rag_chain_session.invoke({"input":"tell me more about NormLayer"}, config= configuration1)["answer"]

            id = uuid7()
Future versions will require UUID v7.
  input_data = validator(cls_, input_data)


'<think>\nOkay, the user is asking about "NormLayer." Let me look through the provided context to see if there\'s any mention of it. Hmm, the context discusses modern transformers, RNNs, fast weight controllers, attention mechanisms, and LSTMs. There\'s a detailed explanation of the attention formula with Q, K, V matrices and scaling by sqrt(d_k). I see references to softmax, query, key, value matrices, and mentions of linear transformers and fast weight programmers. However, I don\'t see any specific mention of a "NormLayer." \n\nNormLayer could refer to a normalization layer, like Layer Normalization or Batch Normalization, which are commonly used in neural networks, including transformers. But since the provided context doesn\'t explicitly discuss normalization layers, I can\'t confirm details from the given text. The user might be expecting information on how normalization is applied in the context of transformers or the attention mechanism described. However, without explicit info