In [2]:
import os
from dotenv import load_dotenv
from rich import print
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a readable message when the key is missing: assigning the
# None returned by os.getenv() into os.environ would otherwise raise an
# unhelpful "str expected, not NoneType" TypeError.
groq_api_key = os.getenv('GROQ_API_KEY')
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
os.environ['GROQ_API_KEY'] = groq_api_key

### Loading the Webpage content

In [13]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Documentation page that serves as the knowledge source for this notebook.
SOURCE_URL = "https://python.langchain.com/v0.1/docs/expression_language/streaming/"

# Fetch the page and keep the loaded document(s) for splitting in the next step.
loader = WebBaseLoader(SOURCE_URL)
webpage_content = loader.load()

### Splitting into chunks

In [14]:
# Split the loaded page into overlapping pieces (chunk_size=1000 with a
# 200-unit overlap) so that each piece can be embedded on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(webpage_content)

# Peek at one chunk to sanity-check the split.
print(chunks[6])

### Storing chunks in vectorstore

In [15]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model, loaded by name through HuggingFaceEmbeddings.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Index the chunks in a Chroma vector store, then expose the store as a retriever.
db = Chroma.from_documents(documents=chunks, embedding=embeddings)
retriever = db.as_retriever()

retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000002567269C4D0>)

### Simple RAG chain

In [21]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

# Chat model shared by the chains built further down in this notebook.
llm = ChatGroq(model="llama-3.1-8b-instant")

# System instructions; the {context} slot is filled with the retrieved documents.
system_prompt = """You are a helpful assistant for Q/A tasks. Use the following pieces of
    retrieved content to answer the question. If you don't know the answer, just say
    that you don't know. Keep the answer concise.\n\n
    Context:
    {context}
    """

# Two-message prompt: the instructions above, then the user's question.
prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

prompt

ChatPromptTemplate(input_variables=['context', 'input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="You are a helpful assistant for Q/A tasks. Use the following pieces of\n    retrieved content to answer the question. If you don't know the answer, just say\n    that you don't know. Keep the answer concise.\n\n\n    Context:\n    {context}\n    ")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}'))])

In [22]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Answering step: stuffs the retrieved documents into the prompt and calls the LLM.
qa_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: retrieve documents for the input, then hand them to qa_chain.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=qa_chain,
)

In [25]:
# Show the composed chain's repr to inspect how its steps are wired together.
rag_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000002567269C4D0>), config={'run_name': 'retrieve_documents'})
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), config={'run_name': 'format_inputs'})
            | ChatPromptTemplate(input_variables=['context', 'input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="You are a helpful assistant for Q/A tasks. Use the following pieces of\n    retrieved content to answer the question. If you don't know the answer, just say\n    that you don't know. Keep the answer concise.\n\n\n    Context:\n    {context}\n    ")), HumanMessagePromptTemplate(prompt=PromptTemplate

In [24]:
# Ask a first, self-contained question. The result is a dict that carries the
# original input, the retrieved context documents and the generated answer.
first_query = {"input": "What is Streaming?"}
response = rag_chain.invoke(first_query)

response

{'input': 'What is Streaming?',
 'context': [Document(metadata={'description': 'Streaming is critical in making applications based on LLMs feel responsive to end-users.', 'language': 'en', 'source': 'https://python.langchain.com/v0.1/docs/expression_language/streaming/', 'title': 'Streaming | 🦜️🔗 LangChain'}, page_content='Streaming | 🦜️🔗 LangChain'),
  Document(metadata={'description': 'Streaming is critical in making applications based on LLMs feel responsive to end-users.', 'language': 'en', 'source': 'https://python.langchain.com/v0.1/docs/expression_language/streaming/', 'title': 'Streaming | 🦜️🔗 LangChain'}, page_content="to end-users.Important LangChain primitives like LLMs, parsers, prompts, retrievers, and agents implement the LangChain Runnable Interface.This interface provides two general approaches to stream content:sync stream and async astream: a default implementation of streaming that streams the final output from the chain.async astream_events and async astream_log: th

In [27]:
# Follow-up question sent with only an "input" key: no chat history is passed,
# so the chain has nothing that tells it what "it" refers to.
rag_chain.invoke({"input": "I couldn't understand it?"})

{'input': "I couldn't understand it?",
 'context': [Document(metadata={'description': 'Streaming is critical in making applications based on LLMs feel responsive to end-users.', 'language': 'en', 'source': 'https://python.langchain.com/v0.1/docs/expression_language/streaming/', 'title': 'Streaming | 🦜️🔗 LangChain'}, page_content='happens if we try to stream them? 🤨from langchain_community.vectorstores import FAISSfrom langchain_core.output_parsers import StrOutputParserfrom langchain_core.prompts import ChatPromptTemplatefrom langchain_core.runnables import RunnablePassthroughfrom langchain_openai import OpenAIEmbeddingstemplate = """Answer the question based only on the following context:{context}Question: {question}"""prompt = ChatPromptTemplate.from_template(template)vectorstore = FAISS.from_texts(    ["harrison worked at kensho", "harrison likes spicy food"],    embedding=OpenAIEmbeddings(),)retriever = vectorstore.as_retriever()chunks = [chunk for chunk in retriever.stream("where 

**It doesn't remember the previous messages or the chat history. We'll add this feature now.**

### Adding Chat History

`1. History Aware Retriever`

In [28]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

# System instructions for the question-rewriting step: the model is asked to
# turn the latest user message into a standalone question and NOT to answer it.
# The literal's line breaks, indentation and trailing spaces are part of the prompt text.
condense_q_system_template = (
    """Given a chat history and the latest user question 
    which might reference context in the chat history, 
    formulate a standalone question which can be understood 
    without the chat history. Do NOT answer the question, 
    just reformulate it if needed and otherwise return it as is."""
)

# Prompt layout for the rewriting step: instructions first, then the earlier
# turns of the conversation, then the newest user message.
condense_q_messages = [
    ("system", condense_q_system_template),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
condense_q_prompt = ChatPromptTemplate.from_messages(condense_q_messages)

# Retriever that takes the chat history into account when building the search query.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=condense_q_prompt,
)

history_aware_retriever

RunnableBinding(bound=RunnableBranch(branches=[(RunnableLambda(lambda x: not x.get('chat_history', False)), RunnableLambda(lambda x: x['input'])
| VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000002567269C4D0>))], default=ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Given a chat history and the latest user question \n    which might reference context in the chat history, \n    formulate a standalone question which can be understood \n    without the chat history. Do NOT answer the question

`2. Q/A Chain`

In [29]:
# System instructions for the answering step; {context} receives the retrieved documents.
system_prompt = """You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer
    the question. If you don't know the answer, say that you
    don't know. Keep the
    answer concise.\n\n

    {context}
    """

# Message layout: instructions, then the earlier turns (if any), then the new question.
qa_messages = [
    ("system", system_prompt),
    ("placeholder", "{chat_history}"),
    ("human", "{input}"),
]
qa_prompt = ChatPromptTemplate.from_messages(qa_messages)

# Answering chain that stuffs the retrieved documents into qa_prompt.
qa_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)
qa_chain


RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), config={'run_name': 'format_inputs'})
| ChatPromptTemplate(input_variables=['context', 'input'], optional_variables=['chat_history'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, partial_variables={'chat_history': []}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="You are an assistant for question-answering tasks.\n    Use the following pieces of retrieved context to answer\n    the question. If you don't know the answer, say that you\n    don't know. Keep the\n    answer concise.\n\n\n\n    {context}\n    ")), MessagesPlaceholder(variable_name='chat_histo

`3. Retrieval Chain`

In [30]:
# Rebuild the RAG pipeline, this time with the history-aware retriever in front
# of the history-aware answering chain.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=qa_chain,
)
rag_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableBranch(branches=[(RunnableLambda(lambda x: not x.get('chat_history', False)), RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000002567269C4D0>))], default=ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Given a chat history and the latest user question \n    which might reference context in the chat history, \n    formulate a standalone question which can be u

#### **Question No.1**

In [31]:
from langchain_core.messages import AIMessage, HumanMessage

# Start a fresh conversation: there are no earlier turns yet.
chat_history = []
question = "What is streaming?"

response = rag_chain.invoke({"input": question, "chat_history": chat_history})

# Record the exchange (user turn, then model turn) so later questions can refer to it.
chat_history += [
    HumanMessage(content=question),
    AIMessage(content=response["answer"]),
]

print(response["answer"])

#### **Question No.2 (Follow-up Question)**

In [32]:
# Follow-up that only makes sense in light of the previous exchange.
question2 = "Can you explain it more because I couldn't get what you said."

response = rag_chain.invoke({"input": question2, "chat_history": chat_history})

# Append this exchange to the running history.
chat_history += [
    HumanMessage(content=question2),
    AIMessage(content=response["answer"]),
]

print(response["answer"])
print(chat_history)

#### **Question No.3 (Follow-up Question)**

In [33]:
# Second follow-up, again relying on the accumulated history for what "it" means.
question3 = "Ok. Can you explain it now through code?"

response = rag_chain.invoke({"input": question3, "chat_history": chat_history})

# Append this exchange to the running history.
chat_history += [
    HumanMessage(content=question3),
    AIMessage(content=response["answer"]),
]

print(response["answer"])
chat_history

[HumanMessage(content='What is streaming?'),
 AIMessage(content='Streaming is a way to process and yield output in chunks, rather than all at once, allowing for a more responsive and interactive experience, especially when working with large language models (LLMs) that can take a long time to generate a complete response.'),
 HumanMessage(content="Can you explain it more because I couldn't get what you said."),
 AIMessage(content="In simple terms, streaming is a way to break down a big task, like generating a long response from a language model, into smaller chunks, and send those chunks one by one, rather than all at once.\n\nThink of it like a chef cooking a meal. Instead of serving the whole meal at once, the chef serves each course one by one, so you can enjoy each part of the meal as it's ready. That's similar to how streaming works with language models."),
 HumanMessage(content='Ok. Can you explain it now through code?'),
 AIMessage(content='In the context of LangChain, streaming

### Combining this RAG Chain with ChatMessageHistory

In [40]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory registry of chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying the conversation in ``store``.

    Returns:
        The history object stored under ``session_id``. A new, empty
        ``ChatMessageHistory`` is registered when the id is not yet known.
    """
    try:
        return store[session_id]
    except KeyError:
        # First time this session id is seen: register an empty history.
        history = store[session_id] = ChatMessageHistory()
        return history


# Names of the dict keys the wrapper uses: where the new user message is read
# from, where the stored history is injected, and which output field is
# treated as the reply.
history_keys = dict(
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

# Wrap the RAG chain so the history for each session is looked up through
# get_session_history instead of being passed in by hand.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain, get_session_history, **history_keys
)

#### **First Session_id**

In [41]:
# Calls made with this config are tied to the "ak216" session.
config = {"configurable": {"session_id": "ak216"}}

# Opening question for this session.
first_reply = conversational_rag_chain.invoke(
    {"input": "What is langchain streaming?"}, config=config
)
first_reply['answer']

'LangChain streaming refers to the ability to stream content in chunks from LangChain primitives, such as LLMs, parsers, prompts, retrievers, and agents, to end-users in a responsive manner. This is achieved through the LangChain Runnable Interface, which provides two approaches to streaming: sync stream and async astream.'

In [42]:
# Follow-up in the same session: "it" relies on the history stored for this session id.
follow_up_reply = conversational_rag_chain.invoke(
    {"input": "How to use it?"}, config=config
)
follow_up_reply['answer']

'To use LangChain streaming, primitives like LLMs, parsers, prompts, retrievers, and agents implement the LangChain Runnable Interface, which provides two approaches: sync stream and async astream. These methods allow streaming of final output in chunks. For async streaming, use async throughout the code, propagate callbacks, and call .astream() on LLMs to stream tokens.'

#### **Second Session_id**

In [43]:
# Switch to a different session id; no history has been stored under it yet.
config = {"configurable": {"session_id": "aasher123"}}

# First question of the new session. With no earlier turns in this session,
# "its" has nothing to refer back to.
new_session_reply = conversational_rag_chain.invoke(
    {"input": "Can you give me its code?"}, config=config
)
new_session_reply['answer']

'You\'re referring to the code for propagating callbacks correctly. Here it is:\n\n```python\nfrom langchain.chains import Chain\nfrom langchain.chains.lambdas import Lambda\nfrom langchain.tools import Tool\n\ndef reverse_and_double(input_str: str):\n    return double_tool.invoke(reverse_word.invoke(input_str))\n\nreverse_word = Lambda(lambda x: x[::-1])\ndouble_tool = Tool(lambda x: x*2)\n\nreverse_and_double = Chain([reverse_word, double_tool])\n\nawait reverse_and_double.invoke("1234")\n\nasync for event in reverse_and_double.stream_events("1234", version="v1"):\n    print(event)\n```\n\nThis code creates a chain of two tools: `reverse_word` and `double_tool`. The `reverse_word` tool reverses the input string, and the `double_tool` tool doubles the input string. The `Chain` class is used to combine these tools into a single chain, and the `stream_events` method is used to generate stream events for the chain.'