In [1]:
import os
from dotenv import load_dotenv
# Read key/value pairs from a local .env file into the process environment so
# that the os.getenv lookup in the next cell can find HF_TOKEN.
load_dotenv()

True

In [2]:
# Re-export the Hugging Face token for libraries that read it from the
# environment. os.environ only accepts strings, so assigning the None returned
# by os.getenv for a missing variable would raise TypeError; skip the
# assignment when no token is configured.
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    os.environ['HF_TOKEN'] = hf_token

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by the similarity and vector-store cells that follow.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [5]:
# Three short questions used as the toy corpus for the similarity demo.
documents = [
    "What is the captial of USA?",
    "Who is the President of USA?",
    "Who is the prime minister of India?",
]

In [6]:
# Statement that gets compared against each entry of the toy corpus.
my_query = "Narendra Modi is the Prime Minister of India"

In [7]:
# Embed every string in `documents`; the result is a list with one vector
# (itself a list of floats) per input string.
document_embeddings = embeddings.embed_documents(documents)

In [8]:
# Display the raw document vectors.
# NOTE(review): this echoes every float of every vector; showing a slice or
# just the dimensions would keep the notebook output readable.
document_embeddings

[[0.04512860253453255,
  -0.02247508242726326,
  -0.04122825711965561,
  -0.014120355248451233,
  -0.07973093539476395,
  0.024367500096559525,
  0.027888286858797073,
  -0.04686017334461212,
  0.001504570827819407,
  0.021463459357619286,
  -0.006095570512115955,
  -0.0552329383790493,
  0.008810142055153847,
  -0.013135109096765518,
  -0.05852409452199936,
  -0.0633220300078392,
  -0.006779501214623451,
  -0.027674945071339607,
  0.07579901069402695,
  -0.01031936053186655,
  0.08004873991012573,
  0.056561101227998734,
  0.05143629387021065,
  -0.02375621162354946,
  -0.014375762082636356,
  0.0024322601966559887,
  -0.01218230277299881,
  0.025410136207938194,
  0.04718494042754173,
  -0.08190310746431351,
  -0.042138997465372086,
  0.0006446812185458839,
  0.09544165432453156,
  0.010904848575592041,
  -0.006026749964803457,
  -0.004494121763855219,
  0.13083000481128693,
  -0.012148306705057621,
  0.06551742553710938,
  -0.007887870073318481,
  -0.002252258127555251,
  -0.0653078

In [9]:
# Embed the single query string; unlike embed_documents this yields one flat
# list of floats rather than a list of vectors.
query_embeddings = embeddings.embed_query(my_query)

In [10]:
# Display the raw query vector.
# NOTE(review): prints the full vector; a slice would be easier to read.
query_embeddings

[-0.024652225896716118,
 0.06448184698820114,
 -0.015243330039083958,
 -0.008853137493133545,
 0.049786970019340515,
 -0.047787249088287354,
 0.07992971688508987,
 -0.011070577427744865,
 -0.02630140446126461,
 0.0025293168146163225,
 0.015217956155538559,
 -0.049890484660863876,
 -0.0014494946226477623,
 0.03467869386076927,
 0.04812585934996605,
 0.017833847552537918,
 -0.0016865141224116087,
 0.004727127030491829,
 -0.03536828234791756,
 -0.08200481534004211,
 0.05238301679491997,
 0.09946000576019287,
 0.015678783878684044,
 -0.018165603280067444,
 0.010221652686595917,
 -0.024437854066491127,
 -0.03731080889701843,
 -0.03832275792956352,
 -0.013971819542348385,
 -0.008656114339828491,
 0.04228418692946434,
 0.01743965782225132,
 -0.07628323882818222,
 -0.027769701555371284,
 -0.023593764752149582,
 0.012215846218168736,
 -0.09990193694829941,
 0.06312437355518341,
 0.0985054224729538,
 -0.06997865438461304,
 0.05495534837245941,
 -0.04872513934969902,
 0.004754565190523863,
 -0.06

In [12]:
# The query vector is wrapped in a list so both arguments are 2-D; the result
# is a 1 x len(documents) array where a higher value means a closer match.
cosine_similarity([query_embeddings], document_embeddings)

array([[0.17402038, 0.31278143, 0.73996883]])

In [14]:
# Same comparison using Euclidean distance: here a smaller value means a
# closer match, so the ranking is read in the opposite direction.
euclidean_distances([query_embeddings], document_embeddings)

array([[1.28528559, 1.17236385, 0.72115345]])

In [16]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [17]:
# Size the index from the embedding model itself instead of hard-coding 384,
# so the cell keeps working if `embeddings` is switched to a model with a
# different vector length.
embedding_dim = len(embeddings.embed_query("hello world"))

# Flat (exhaustive-search) FAISS index that ranks by L2 distance.
index = faiss.IndexFlatL2(embedding_dim)

In [18]:
# Show the repr of the FAISS index object that was just created.
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001AC27FD54A0> >

In [21]:
# Wrap the empty FAISS index in a LangChain vector store. Texts are embedded
# with `embeddings`; documents live in a fresh in-memory docstore and the
# position -> docstore-id mapping starts out empty.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [22]:
# Insert three sample sentences. A list (not a set literal) is used so the
# texts are added in the written order: set iteration order for strings is
# arbitrary, which made the index-position -> text mapping differ between runs.
vector_store.add_texts(["AI is the future", "AI is powerful", "Dogs are cute"])

['9f02bffc-412b-4a69-b623-78b7b17bea11',
 'ab751eff-40cf-481e-89bf-3abe20384841',
 'baa9e16b-fdec-4190-8fac-3eb81e4ec8db']

In [23]:
# Inspect the mapping from FAISS row position to the docstore id of each text.
vector_store.index_to_docstore_id

{0: '9f02bffc-412b-4a69-b623-78b7b17bea11',
 1: 'ab751eff-40cf-481e-89bf-3abe20384841',
 2: 'baa9e16b-fdec-4190-8fac-3eb81e4ec8db'}

In [24]:
# Retrieve the two stored texts (k=2) closest to the query.
results = vector_store.similarity_search('Tell me about AI', k=2)

In [25]:
# Show the retrieved Document objects.
results

[Document(id='9f02bffc-412b-4a69-b623-78b7b17bea11', metadata={}, page_content='AI is the future'),
 Document(id='baa9e16b-fdec-4190-8fac-3eb81e4ec8db', metadata={}, page_content='AI is powerful')]

In [30]:
from langchain_core.documents import Document

# (page_content, source) pairs for the ten sample documents, in insertion order.
_SAMPLE_POSTS = [
    ("I had chocolate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
    ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
    ("Building an exciting new project with LangChain - come check it out!", "tweet"),
    ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
    ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
    ("Is the new iPhone worth the price? Read this review to find out.", "website"),
    ("The top 10 soccer players in the world right now.", "website"),
    ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
    ("The stock market is down 500 points today due to fears of a recession.", "news"),
    ("I have a bad feeling I am going to get deleted :(", "tweet"),
]

# Build one Document per pair; every document gets its own metadata dict.
documents = [
    Document(page_content=text, metadata={"source": origin})
    for text, origin in _SAMPLE_POSTS
]

# Keep the individual document_N names bound to the same objects.
(
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
) = documents

In [31]:
from langchain_community.vectorstores.utils import DistanceStrategy

# Size the index from the embedding model instead of hard-coding 384.
embedding_dim = len(embeddings.embed_query("hello world"))

# Flat FAISS index that ranks by inner product (larger = more similar).
index = faiss.IndexFlatIP(embedding_dim)
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    # Tell the wrapper that scores from this index are inner products. Its
    # default strategy is Euclidean distance, which would make score
    # thresholds and relevance scores treat "larger" as "further away".
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [34]:
# Embed and store the sample documents; the call returns the generated
# docstore ids, one per document.
vector_store.add_documents(documents=documents)

['412b1a2d-addc-4563-a2b3-13ab7ff94fbc',
 '40f1c80b-f271-4153-8b9c-7fc63e2a3714',
 'c07d36cb-a07f-4be4-9100-897b8100df7c',
 '7e4463d2-ecc8-46c0-b814-3b3daeef4847',
 '22ffde6a-3723-4bed-8c23-fe9e7d376a66',
 'b09310b3-dbed-4598-be72-bfc46faae09c',
 '978ce811-7dd7-4e19-afbe-af97f902c4f0',
 '587b281a-dd84-4ee2-af92-25c8a732560c',
 '5ebd1bd6-094b-4a96-a42a-b5a4de1d2eba',
 'b412036c-27c2-452b-9ad9-a547cae0c85d']

In [35]:
# Search only among documents whose metadata has source == "news".
# k is not passed, so the store's default number of results applies.
result = vector_store.similarity_search(
    query="LangChain provides abstractions to make working with LLMs easy",
    filter={"source": "news"},
)

In [36]:
# Show the documents returned by the metadata-filtered search.
result

[Document(id='7e4463d2-ecc8-46c0-b814-3b3daeef4847', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(id='40f1c80b-f271-4153-8b9c-7fc63e2a3714', metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
 Document(id='5ebd1bd6-094b-4a96-a42a-b5a4de1d2eba', metadata={'source': 'news'}, page_content='The stock market is down 500 points today due to fears of a recession.')]

In [37]:
# Expose the vector store through the retriever interface, asking for the
# top 3 matches (k=3) on each query.
retriever=vector_store.as_retriever(search_kwargs={"k": 3})

In [38]:
# Run the same query as the filtered search, this time without a metadata filter.
retriever.invoke("LangChain provides abstractions to make working with LLMs easy")

[Document(id='c07d36cb-a07f-4be4-9100-897b8100df7c', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='587b281a-dd84-4ee2-af92-25c8a732560c', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='b412036c-27c2-452b-9ad9-a547cae0c85d', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [39]:
# Persist the vector store to a local folder named "faiss index" (relative to
# the notebook's working directory) so it can be reloaded later.
vector_store.save_local("faiss index")

In [40]:
# Reload the store saved in the "faiss index" folder, reusing the same
# embedding model so queries are encoded consistently with the stored vectors.
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing
# the saved files (presumably pickle-based - confirm). That can execute
# arbitrary code, so only load folders this notebook wrote itself.
new_vector_store=FAISS.load_local(
  "faiss index",embeddings ,allow_dangerous_deserialization=True
)

In [41]:
# Query the reloaded store to confirm it returns the documents saved earlier.
new_vector_store.similarity_search("langchain")

[Document(id='c07d36cb-a07f-4be4-9100-897b8100df7c', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='587b281a-dd84-4ee2-af92-25c8a732560c', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='22ffde6a-3723-4bed-8c23-fe9e7d376a66', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again."),
 Document(id='978ce811-7dd7-4e19-afbe-af97f902c4f0', metadata={'source': 'website'}, page_content='The top 10 soccer players in the world right now.')]

In [42]:
from langchain_community.document_loaders import PyPDFLoader

In [43]:
# Location of the syllabus PDF. The original Windows path stays the default so
# existing setups keep working; set the SYLLABUS_PDF_PATH environment variable
# to point at the file on another machine without editing the notebook.
FILE_PATH = os.getenv(
    "SYLLABUS_PDF_PATH",
    r"C:\Study\AgenticAI\2 - Langchain Basics\2.1 - Data Ingestion\syllabus.pdf",
)

In [44]:
# Create a loader for the PDF at FILE_PATH; nothing is read until a load
# method is called on it.
loader=PyPDFLoader(FILE_PATH)

In [45]:
# Sanity check: count the documents the loader produces (presumably one per
# PDF page - confirm against the file).
len(loader.load())

34

In [46]:
# Load the whole PDF eagerly into a list of documents.
# NOTE(review): `pages` is reassigned by the async loading cell that follows,
# so this second full load of the file is redundant.
pages=loader.load()

In [47]:
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [48]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [49]:
# Chunking configuration; both values are tunable hyperparameters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,    # hyperparameter: upper bound on the size of each chunk
    chunk_overlap=50,  # hyperparameter: amount shared between neighbouring chunks
)

In [50]:
# Split the loaded pages into smaller chunk documents using the splitter
# configured above.
split_docs = splitter.split_documents(pages)

In [51]:
# Number of chunks produced by the splitter.
len(split_docs)

81

In [52]:
from langchain_community.vectorstores.utils import DistanceStrategy

# Size the index from the embedding model instead of hard-coding 384.
embedding_dim = len(embeddings.embed_query("hello world"))

# Fresh inner-product index and store for the PDF chunks; this replaces the
# earlier sample-document store bound to the same names.
index = faiss.IndexFlatIP(embedding_dim)
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    # Tell the wrapper that scores from this index are inner products. Its
    # default strategy is Euclidean distance, which would make score
    # thresholds and relevance scores treat "larger" as "further away".
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [53]:
# Embed and store every PDF chunk; the call returns one generated id per chunk.
vector_store.add_documents(documents=split_docs)

['f74ac0d9-e1d4-4f5a-83d7-7e1b61f5a653',
 '73ce7147-a960-4a3f-b9c1-e54b5a7f0db3',
 '0b08ca04-cf1f-4a39-b7a3-293fb197066c',
 '2144a346-6736-4b97-a2dd-2d1ed3f164fd',
 '1dbbd8d0-2980-43e6-9395-beb5786e3d68',
 '39ad861a-712d-43dd-bedd-f82a9f76bbef',
 'f280ea79-09c3-4ac2-961a-662f404a044c',
 '8c281006-d456-4542-af71-526a06748355',
 'ce10bdde-ce53-4c92-9f23-6d12c5c48f10',
 '321a8f43-1f0e-41cb-82f9-87a9ee1b9ba2',
 'e1815d58-ca11-4d97-b007-1af5d5ee834a',
 'c36572e1-ecfa-4980-880d-4bf8665a0f0f',
 '17a947f9-ceaf-4614-babd-9d3f6e80ff01',
 'cfe3cc3e-ac76-4ad2-8643-8d50c361f854',
 '6c42aa53-0813-4488-bbb1-1573cb4c3dea',
 'bf4f3b57-242b-41b8-bbc5-4c867de6606d',
 '2d45d921-a981-42bb-a35d-903a32eadf6d',
 '888daed2-7584-49c8-9f93-1e04f5df0acd',
 '309f1ac5-1806-49bd-b2f3-36ba6a4aa551',
 '369a73fe-de5c-43d4-a996-ab2f66b694ff',
 '29604ca6-c192-4d27-8ecc-6040e964d663',
 '9265af3a-7d83-4981-85e0-bc07385c8f28',
 '35638ef9-c8e6-48d6-acdd-a9c6772a13a4',
 '752ee5d0-4da6-494e-83a7-a64da219447b',
 '70af6a03-3944-

In [54]:
# Retriever over the PDF chunks; k (number of chunks returned per query) is a
# tunable hyperparameter.
retriever = vector_store.as_retriever(search_kwargs={"k": 10})

In [55]:
# Ask the PDF-backed retriever a question and display the matching chunks.
retriever.invoke("what is langchain?")

[Document(id='a531f5ef-208b-4292-b6eb-345f59166dab', metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-01-30T20:27:03+00:00', 'title': 'Ultimate Data Science & GenAI Bootcamp', 'moddate': '2025-01-30T20:26:59+00:00', 'keywords': 'DAGdmhcqnYw,BAEmsmap8Lg,0', 'author': 'monal singh', 'containsaigeneratedcontent': 'Yes', 'source': 'C:\\Study\\AgenticAI\\2 - Langchain Basics\\2.1 - Data Ingestion\\syllabus.pdf', 'total_pages': 34, 'page': 31, 'page_label': '32'}, page_content='various use cases.\nIntroduction to Retrieval-Augmented\nGeneration (RAG)\nTopics\nOverview of Retrieval-Augmented\nGeneration (RAG)\nWhat is RAG?, Key Components of a\nRAG System, Why RAG is Important for\nAdvanced AI Systems\nUnderstanding the End-to-End RAG\nPipeline\nOverview of the RAG Workflow, Data\nRetrieval, Contextualization, and\nGeneration Phases, Challenges and\nOpportunities in RAG\nIntegrating LangChain in RAG Introduction to LangChain Framework,\nBuilding End-to-End RAG Pipeline