In [6]:
import os

import warnings
warnings.filterwarnings('ignore')

from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import AIMessage, HumanMessage

from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader, PyPDFLoader
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from dotenv import load_dotenv

# Load variables from a local .env file (if one is found) into the process environment.
load_dotenv()

# Fail fast with an actionable message when the key is missing or empty,
# instead of an opaque error surfacing further down the notebook.
# No re-assignment into os.environ is needed: os.getenv already reads from it.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )


### 1. Data Ingestion

In [2]:
# (source title, topic tag, raw text) for each hand-written sample passage.
# The raw text deliberately keeps the literal indentation of the
# triple-quoted strings, so page_content is unchanged.
_SAMPLE_SECTIONS = [
    (
        "AI Introduction",
        "AI",
        """
        Artificial Intelligence (AI) is the simulation of human intelligence in machines.
        These systems are designed to think like humans and mimic their actions.
        AI can be categorized into narrow AI and general AI.
        """,
    ),
    (
        "ML Basics",
        "ML",
        """
        Machine Learning is a subset of AI that enables systems to learn from data.
        Instead of being explicitly programmed, ML algorithms find patterns in data.
        Common types include supervised, unsupervised, and reinforcement learning.
        """,
    ),
    (
        "Deep Learning",
        "DL",
        """
        Deep Learning is a subset of machine learning based on artificial neural networks.
        It uses multiple layers to progressively extract higher-level features from raw input.
        Deep learning has revolutionized computer vision, NLP, and speech recognition.
        """,
    ),
    (
        "NLP Overview",
        "NLP",
        """
        Natural Language Processing (NLP) is a branch of AI that helps computers understand human language.
        It combines computational linguistics with machine learning and deep learning models.
        Applications include chatbots, translation, sentiment analysis, and text summarization.
        """,
    ),
]

# Wrap every passage in a Document; each sample is tagged as page 1 of its source.
sample_documents = [
    Document(
        page_content=text,
        metadata={"source": source, "page": 1, "topic": topic},
    )
    for source, topic, text in _SAMPLE_SECTIONS
]

sample_documents

[Document(metadata={'source': 'AI Introduction', 'page': 1, 'topic': 'AI'}, page_content='\n        Artificial Intelligence (AI) is the simulation of human intelligence in machines.\n        These systems are designed to think like humans and mimic their actions.\n        AI can be categorized into narrow AI and general AI.\n        '),
 Document(metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='\n        Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.\n        '),
 Document(metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='\n        Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revo

In [4]:
# Split on progressively finer boundaries: paragraphs, then lines, then words,
# and finally single characters so an over-long token can still be broken up.
# A lone " " separator would ignore paragraph/line structure entirely.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,      # upper bound per chunk, measured with length_function
    chunk_overlap=50,    # overlap carried over between neighbouring chunks
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)

# Each resulting chunk is a Document that keeps the metadata of its source document.
chunks = text_splitter.split_documents(sample_documents)

In [5]:
# Inspect the chunks produced by the splitter.
chunks

[Document(metadata={'source': 'AI Introduction', 'page': 1, 'topic': 'AI'}, page_content='Artificial Intelligence (AI) is the simulation of human intelligence in machines.\n        These systems are designed to think like humans and mimic their actions.\n        AI can be categorized into narrow AI and general AI.'),
 Document(metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.'),
 Document(metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revolutionized computer vision, NLP, and speech recogn

### 2. Load the embedding model

In [10]:
# Embedding model shared by every later cell. `dimensions` asks the API for
# 1536-dimensional vectors from text-embedding-3-large.
embedding = OpenAIEmbeddings(model='text-embedding-3-large', dimensions=1536)

# Sanity check: embed a single query and confirm the vector length matches `dimensions`.
sample_text = "What is machine learning"
sample_embedding = embedding.embed_query(sample_text)
len(sample_embedding)

1536

In [11]:
# Embed several short texts in one batch call; the result holds one vector per input text.
texts = ["AI", "ML", "DL", "NN"]
batch_embeddings = embedding.embed_documents(texts)

len(batch_embeddings)

4

### Compare embeddings with cosine similarity

In [13]:
import numpy as np

def compare_embeddings(text1: str, text2: str):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is embedded separately with the notebook-level ``embedding``
    model (``embed_query``), and the similarity is the dot product of the two
    vectors divided by the product of their Euclidean norms.

    Args:
        text1: First text to embed.
        text2: Second text to embed.

    Returns:
        The cosine similarity as a NumPy floating-point scalar.
    """
    # Embed in argument order: text1 first, then text2.
    vec_a, vec_b = (np.array(embedding.embed_query(text)) for text in (text1, text2))

    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / norm_product

In [14]:
# Cosine similarity between the embeddings of two country names.
compare_embeddings("INDIA", "CHINA")

np.float64(0.585355339361695)

In [15]:
# NOTE(review): "KENIA" is presumably a misspelling of "KENYA"; the recorded
# score below was computed for the string exactly as written — confirm before changing.
compare_embeddings("INDIA", "KENIA")

np.float64(0.4195405335116149)

In [16]:
# Same comparison for two short acronyms (presumably Machine Learning vs. Deep Learning).
compare_embeddings("ML", "DL")

np.float64(0.5378704409355349)

### Create FAISS Vector Store

In [18]:
# Embed every chunk with the shared embedding model and index the vectors in a FAISS store.
vector_store = FAISS.from_documents(chunks, embedding)

# Report how many vectors the underlying FAISS index holds.
indexed_count = vector_store.index.ntotal
print(f"vector store created with {indexed_count} vectors")

vector store created with 4 vectors


### Save the vectors in FAISS

In [None]:
# Persist the vector store to a local folder. The same variable feeds both the
# call and the message so the reported location cannot drift from the real one.
faiss_index_dir = "FAISS_index"
vector_store.save_local(folder_path=faiss_index_dir)
print(f"vector store saved at '{faiss_index_dir}'")

In [20]:
# Reload the index from the "FAISS_index" folder, passing the same embedding
# model so that queries are embedded the same way as the stored vectors.
# NOTE: allow_dangerous_deserialization=True permits pickle-based loading of
# the saved store; only enable it for index files you created yourself
# (this one is written by the save cell in this notebook).
loaded_vector_store = FAISS.load_local(
    folder_path="FAISS_index",
    embeddings=embedding,
    allow_dangerous_deserialization=True,
)

In [21]:
# Retrieve the stored chunks closest to the query; the number of hits is left
# at the method's default.
loaded_vector_store.similarity_search(query="what is deep learning")

[Document(id='6e25b2af-c376-44a0-896e-c2c75ed2c1cc', metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revolutionized computer vision, NLP, and speech recognition.'),
 Document(id='547cbbd7-5b45-4f25-b7d7-39ed862bc301', metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.'),
 Document(id='1b65691a-5460-4f16-a9d8-d2b9c91683f3', metadata={'source': 'NLP Overview', 'page': 1, 'topic': 'NLP'}, page_content='Natural Language Processing (NLP) is a branch of AI that helps computers understand human la

In [None]:
# Same query, but return (document, score) pairs for the top 2 hits.
# The score is a distance, so smaller means closer: in the output the best
# match (the Deep Learning chunk) carries the lowest value.
loaded_vector_store.similarity_search_with_score(query="what is deep learning", k=2)

[(Document(id='6e25b2af-c376-44a0-896e-c2c75ed2c1cc', metadata={'source': 'Deep Learning', 'page': 1, 'topic': 'DL'}, page_content='Deep Learning is a subset of machine learning based on artificial neural networks.\n        It uses multiple layers to progressively extract higher-level features from raw input.\n        Deep learning has revolutionized computer vision, NLP, and speech recognition.'),
  np.float32(0.63186)),
 (Document(id='547cbbd7-5b45-4f25-b7d7-39ed862bc301', metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.'),
  np.float32(1.0620673))]

In [27]:
# Similarity search restricted by metadata: only chunks whose metadata has
# topic == "ML" are eligible, so fewer than k results may come back.
# The dict is named `metadata_filter` rather than `filter` so the built-in
# filter() is not shadowed for the rest of the notebook.
metadata_filter = {"topic": "ML"}

filtered_result = loaded_vector_store.similarity_search(
    query="what is machine learning",
    k=2,
    filter=metadata_filter,
)
filtered_result

[Document(id='547cbbd7-5b45-4f25-b7d7-39ed862bc301', metadata={'source': 'ML Basics', 'page': 1, 'topic': 'ML'}, page_content='Machine Learning is a subset of AI that enables systems to learn from data.\n        Instead of being explicitly programmed, ML algorithms find patterns in data.\n        Common types include supervised, unsupervised, and reinforcement learning.')]

### RAG