###### Imports

In [1]:
import os
from dotenv import load_dotenv
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate,PromptTemplate
from langchain_core.runnables import (
    RunnablePassthrough
)
from langchain_core.messages import HumanMessage,AIMessage
from langchain_core.output_parsers import StrOutputParser

In [18]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings,ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader,PyMuPDFLoader
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [4]:
load_dotenv()

True

#### Data Ingestion and Processing

In [5]:
sample_documents = [
    Document(
        page_content="""
        Artificial Intelligence (AI) is the simulation of human intelligence in machines.
        These systems are designed to think like humans and mimic their actions.
        AI can be categorized into narrow AI and general AI.
        """,
        metadata={"source": "AI Introduction", "page": 1, "topic": "AI"}
    ),
    Document(
        page_content="""
        Machine Learning is a subset of AI that enables systems to learn from data.
        Instead of being explicitly programmed, ML algorithms find patterns in data.
        Common types include supervised, unsupervised, and reinforcement learning.
        """,
        metadata={"source": "ML Basics", "page": 1, "topic": "ML"}
    ),
    Document(
        page_content="""
        Deep Learning is a subset of machine learning based on artificial neural networks.
        It uses multiple layers to progressively extract higher-level features from raw input.
        Deep learning has revolutionized computer vision, NLP, and speech recognition.
        """,
        metadata={"source": "Deep Learning", "page": 1, "topic": "DL"}
    ),
    Document(
        page_content="""
        Natural Language Processing (NLP) is a branch of AI that helps computers understand human language.
        It combines computational linguistics with machine learning and deep learning models.
        Applications include chatbots, translation, sentiment analysis, and text summarization.
        """,
        metadata={"source": "NLP Overview", "page": 1, "topic": "NLP"}
    )
]

print(sample_documents)

[Document(metadata={'source': 'AI Introduction', 'page': 1, 'topic': 'AI'}, page_content='\n        Artificial Intelligence (AI) is the simulation of human intelligence in machines.\n        These systems are designed to think like humans and mimic their actions.\n        AI can be categorized into narrow AI and general AI.\n        '), Document(metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='\n        Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.\n        '), Document(metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='\n        Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revolu

##### Text Splitter

In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50,
    length_function = len,
    separators=[' ']
)

chunks = text_splitter.split_documents(sample_documents)
print(f"Create {len(chunks)} chunks from {len(sample_documents)} documents")

Create 4 chunks from 4 documents


#### Embeddings

In [10]:
os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")

In [13]:
embedding_model = OpenAIEmbeddings(
    model='text-embedding-3-small'
)

sample_text = "What is machine learning"
sample_embedding = embedding_model.embed_query(sample_text)

In [14]:
sample_embedding

[-0.0059221177361905575,
 -0.005889697000384331,
 0.0005751235294155777,
 -0.033544257283210754,
 0.021192103624343872,
 0.02213229238986969,
 -0.0007868689717724919,
 0.00929923728108406,
 -0.022564563900232315,
 0.037823744118213654,
 0.015183531679213047,
 -0.03676467761397362,
 -0.03397652879357338,
 0.0016048074467107654,
 0.026735983788967133,
 0.016015654429793358,
 0.0008530605118721724,
 -0.030864175409078598,
 0.023385880514979362,
 0.004903578199446201,
 3.242035018047318e-05,
 0.032550033181905746,
 0.042276136577129364,
 -0.029307996854186058,
 0.015961619094014168,
 -0.02092193439602852,
 0.0266279149800539,
 0.01239538099616766,
 0.00744047062471509,
 -0.006019378546625376,
 -0.009423515759408474,
 -0.029027020558714867,
 -0.03127483278512955,
 0.02755729854106903,
 0.04173579812049866,
 0.00043159592314623296,
 -0.0080564571544528,
 -0.010633875615894794,
 -0.055503640323877335,
 0.0025855230633169413,
 -0.05632495880126953,
 -0.016264209523797035,
 0.03028060868382454,

In [15]:
def compare_embeddings(text1:str,text2:str):
    emb1 = np.array(embedding_model.embed_query(text1))
    emb2 = np.array(embedding_model.embed_query(text2))

    similarity = np.dot(emb1,emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))

    return similarity


In [16]:
print('Semantic Similarity Example')
print(f"'AI' vs 'Artificial Intelligence': {compare_embeddings('AI','Artificial Intelligence'):.3f}")

Semantic Similarity Example
'AI' vs 'Artificial Intelligence': 0.563


In [17]:
print(f"'man' vs 'woman': {compare_embeddings('man','woman'):.3f}")

'man' vs 'woman': 0.708


##### Vector Store

In [19]:
vector_store = FAISS.from_documents(
    documents=chunks,
    embedding=embedding_model
)
print(f'Vectore store created with {vector_store.index.ntotal} vectors')

Vectore store created with 4 vectors


In [20]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x7f81b3483b60>

In [21]:
vector_store.save_local('faiss_index')

In [22]:
loaded_vectorstore = FAISS.load_local('faiss_index',
                                      embedding_model,
                                      allow_dangerous_deserialization=True)

In [23]:
query = "What is Deep Learning"

result = vector_store.similarity_search(query,k=3)
print(result)

[Document(id='5040e015-f015-45dd-9991-6020791d1595', metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revolutionized computer vision, NLP, and speech recognition.'), Document(id='8ef7e8b6-86ff-4a51-a059-a2702a452bac', metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.'), Document(id='cd0ac791-8820-4137-9247-12f626a897f8', metadata={'source': 'NLP Overview', 'page': 1, 'topic': 'NLP'}, page_content='Natural Language Processing (NLP) is a branch of AI that helps computers understand human lang

In [25]:
print(f"Query: {query}")
print("Top 3 similar chunks:")
for i,doc in enumerate(result):
    print(f'{i+1}, Source: {doc.metadata['source']}')
    print(f'Content: {doc.page_content[:200]}...')

Query: What is Deep Learning
Top 3 similar chunks:
1, Source: Deep Learning
Content: Deep Learning is a subset of machine learning based on artificial neural networks.
        It uses multiple layers to progressively extract higher-level features from raw input.
        Deep learning ...
2, Source: ML Basics
Content: Machine Learning is a subset of AI that enables systems to learn from data.
        Instead of being explicitly programmed, ML algorithms find patterns in data.
        Common types include supervised...
3, Source: NLP Overview
Content: Natural Language Processing (NLP) is a branch of AI that helps computers understand human language.
        It combines computational linguistics with machine learning and deep learning models.
      ...


In [30]:
### similarity search with scores
result_with_score = vector_store.similarity_search_with_score(query,k=3)

print(f"Query: {query}")
print("Top 3 similar chunks with scores:")
for doc,score in result_with_score:
    print(f'Score:{score:.3f}')
    print(f'Source: {doc.metadata['source']}')
    print(f'Content: {doc.page_content[:200]}...')

Query: What is Deep Learning
Top 3 similar chunks with scores:
Score:0.509
Source: Deep Learning
Content: Deep Learning is a subset of machine learning based on artificial neural networks.
        It uses multiple layers to progressively extract higher-level features from raw input.
        Deep learning ...
Score:1.141
Source: ML Basics
Content: Machine Learning is a subset of AI that enables systems to learn from data.
        Instead of being explicitly programmed, ML algorithms find patterns in data.
        Common types include supervised...
Score:1.236
Source: NLP Overview
Content: Natural Language Processing (NLP) is a branch of AI that helps computers understand human language.
        It combines computational linguistics with machine learning and deep learning models.
      ...


In [32]:
### search with metadata filtering
filter_dict = {'topic':'ML'}
filtered_results = vector_store.similarity_search(
    query,
    k=3,
    filter=filter_dict
)
print(filtered_results)
print(len(filtered_results))

[Document(id='8ef7e8b6-86ff-4a51-a059-a2702a452bac', metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.')]
1


##### RAG chain with LCEL

In [50]:
from langchain.chat_models import init_chat_model

os.environ['GROQ_API_KEY']=os.getenv("GROQ_API_KEY")

llm = init_chat_model(model="groq:gemma2-9b-it")
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7f81b03734d0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7f81b0372710>, model_name='gemma2-9b-it', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [51]:
simple_prompt = ChatPromptTemplate.from_template("""Answer the question based only on the following context:
                                                 Context: {context}

                                                 Question: {question}
                                                 
                                                 Answer:""")

In [52]:
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={"k":3}
)
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f81b3483b60>, search_kwargs={'k': 3})

In [53]:
from typing import List
def format_docs(docs: List[Document]) -> str:
    formatted = []
    for i,doc in enumerate(docs):
        source = doc.metadata.get('source','unknown')
        formatted.append(f"Document {i+1} (Source: {source}):\n{doc.page_content}")
    return "\n\n".join(formatted)

In [64]:
simple_rag_chain =(
    {"context":retriever | format_docs,"question":RunnablePassthrough() }
    | simple_prompt | llm | StrOutputParser()
)

In [55]:
simple_rag_chain

{
  context: VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f81b3483b60>, search_kwargs={'k': 3})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n                                                 Context: {context}\n\n                                                 Question: {question}\n\n                                                 Answer:'), additional_kwargs={})])
| ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7f81b03734d0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7f81b0372710>, model_name='gemma2-9b-it', mo

###### Conversational RAG Chain

In [56]:
conversational_prompt = ChatPromptTemplate.from_messages([
    ("system","You are a helpful AI assistant. Use the provided context to answer the questions."),
    ("placeholder","{chat_history}"),
    ("human","Context:{context}\n\nQuestion:{input}"),
])

In [57]:
def create_conversational_rag():
    return (
        RunnablePassthrough.assign(
            context = lambda x: format_docs(retriever.invoke(x['input']))
        )
        | conversational_prompt | llm | StrOutputParser()
    )

conversational_rag = create_conversational_rag()

In [58]:
conversational_rag

RunnableAssign(mapper={
  context: RunnableLambda(lambda x: format_docs(retriever.invoke(x['input'])))
})
| ChatPromptTemplate(input_variables=['context', 'input'], optional_variables=['chat_history'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag(tag='HumanMessageChunk')], typing.Annotated[langchain_core.messages.chat.ChatMessageChunk, Tag(tag=

##### Streaming RAG chain

In [59]:
streaming_rag_chain = (
    {"context":retriever | format_docs,"question":RunnablePassthrough()}
    | simple_prompt | llm
)

In [66]:
def test_rag_chain(question):
    print(f"Question:{question}")
    print("-"*30)

    print("\n1.Simple RAG Chain:")
    answer = simple_rag_chain.invoke(question)
    print(f"Answer:{answer}")

    print("\n2. Streaming RAG:")
    print("Answer: ",end="",flush=True)
    for chunk in streaming_rag_chain.stream(question):
        print(chunk.content,end="",flush=True)
    print()
    
    

In [67]:
test_rag_chain("What is the difference between AI and machine learning?")

Question:What is the difference between AI and machine learning?
------------------------------

1.Simple RAG Chain:
Answer:AI is the broad concept of machines simulating human intelligence, while machine learning is a subset of AI that focuses on enabling systems to learn from data without explicit programming.  


2. Streaming RAG:
Answer: AI is the broad concept of machines simulating human intelligence, while machine learning is a subset of AI that focuses on enabling systems to learn from data without explicit programming. 



In [68]:
test_questions = [
    "What is the difference between AI and Machine Learning",
    "Explain deep learning in simple term",
    "How does NLP work?"
]

for question in test_questions:
    print("\n" + "-"*30 + "\n" )
    test_rag_chain(question)


------------------------------

Question:What is the difference between AI and Machine Learning
------------------------------

1.Simple RAG Chain:
Answer:AI is the broader concept of machines performing tasks that typically require human intelligence. Machine Learning, on the other hand, is a specific subset of AI where systems learn from data rather than being explicitly programmed.  


2. Streaming RAG:
Answer: AI is the broader concept of machines performing human-like tasks, while Machine Learning is a subset of AI that focuses on enabling systems to learn from data rather than being explicitly programmed.  


------------------------------

Question:Explain deep learning in simple term
------------------------------

1.Simple RAG Chain:
Answer:Deep learning is like teaching a computer to see, hear, and understand language by showing it tons of examples.  

It uses artificial networks with many layers, kind of like a brain, to learn complex patterns from the data.  This lets comp

In [71]:
### Conversational example
print("\n3.Conversational RAG Example")
chat_history = []
q1 = "What is machine learning?"
a1 = conversational_rag.invoke({
    "input":q1,
    "chat_history":chat_history
})

print(f"Q1: {q1}")
print(f"A1:{a1}")


3.Conversational RAG Example
Q1: What is machine learning?
A1:Machine learning is a subset of Artificial Intelligence that allows systems to learn from data instead of being explicitly programmed.  ML algorithms find patterns in data to make predictions or decisions. 



In [72]:
chat_history.extend([
    HumanMessage(content=q1),
    AIMessage(content=a1)
])

In [73]:
### follow-up question
q2 = "How is it different from traditional programming?"
a2 = conversational_rag.invoke({
    "input":q2,
    "chat_history":chat_history
})
print(f"\nQ2: {q2}")
print(f"A2: {a2}")


Q2: How is it different from traditional programming?
A2: Here's how machine learning differs from traditional programming, drawing from the provided context:

* **Traditional Programming:**  You write explicit instructions (code) that tell the computer exactly what to do step-by-step.  The computer follows these rules rigidly.

* **Machine Learning:** You provide the computer with data and let it learn the patterns and rules from that data. The algorithm figures out the instructions itself,  without being explicitly programmed.

**Analogy:**

Think of teaching a child to identify a cat.

* **Traditional Programming:** You'd write a program with rules like: "If it has four legs, a tail, and fur, it's a cat."
* **Machine Learning:** You show the child many pictures of cats and other animals. The child learns to recognize the common features of cats based on the examples.


Let me know if you have any other questions!

