# _Vector Store_ #

### _FAISS_ ###

In [2]:
#Import libraries
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

In [27]:
#Load pdf
loader = PyPDFLoader(file_path='/mnt/c/Personal/learnlang/1706.03762v7.pdf')
pdf_doc = loader.load()
pdf_doc

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/mnt/c/Personal/learnlang/1706.03762v7.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz K

In [28]:
#Split Text
splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50)
split_doc = splitter.split_documents(pdf_doc)
len(split_doc)
print(split_doc[0].page_content)

Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗ †
University of Toronto
aidan@cs.toronto.edu


In [29]:
 #embedding
embedding = OllamaEmbeddings(model='mxbai-embed-large:335m')
db=FAISS.from_documents(split_doc,embedding)
db

<langchain_community.vectorstores.faiss.FAISS at 0x7f4f765baec0>

In [30]:
#Similarity search in db
query ='We call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of'
f_docs = db.similarity_search(query)
f_docs[0].page_content

'Scaled Dot-Product Attention\n Multi-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nof the values, where the weight assigned to each value is computed by a compatibility function of the\nquery with the corresponding key.\n3.2.1 Scaled Dot-Product Attention\nWe call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of'

In [31]:
# As a retriever

retriever = db.as_retriever()
f_docs = retriever.invoke(query)
f_docs[0].page_content

'Scaled Dot-Product Attention\n Multi-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nof the values, where the weight assigned to each value is computed by a compatibility function of the\nquery with the corresponding key.\n3.2.1 Scaled Dot-Product Attention\nWe call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of'

In [32]:
f_docs_with_score = db.similarity_search_with_score(query)
f_docs_with_score

[(Document(id='16f6286e-cc36-4717-a6a0-5dbd017c338d', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/mnt/c/Personal/learnlang/1706.03762v7.pdf', 'total_pages': 15, 'page': 3, 'page_label': '4'}, page_content='Scaled Dot-Product Attention\n Multi-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nof the values, where the weight assigned to each value is computed by a compatibility function of the\nquery with the corresponding key.\n3.2.1 Scaled Dot-Product Attention\nWe call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of'),
  np.float32(81

In [33]:
#using Embedding for Query
embedding_vector= embedding.embed_query(query)
print(len(embedding_vector))
embedding_vector

1024


[0.02440115064382553,
 0.15963448584079742,
 -0.1952962577342987,
 -0.42458614706993103,
 -0.3379802107810974,
 -0.4507318139076233,
 -0.053389616310596466,
 -0.158680260181427,
 -0.34885072708129883,
 0.65086829662323,
 -0.1268281489610672,
 0.3605460524559021,
 -0.06592094898223877,
 0.11563671380281448,
 -0.15020713210105896,
 0.45072251558303833,
 -0.3353217840194702,
 -0.7297274470329285,
 -0.4103340804576874,
 0.008580312132835388,
 0.012504639104008675,
 0.3031439781188965,
 -0.877472460269928,
 -0.0018278583884239197,
 -0.45316165685653687,
 0.7175368070602417,
 -0.16331465542316437,
 0.12352657318115234,
 1.0435667037963867,
 0.8797747492790222,
 0.23479881882667542,
 -0.33096134662628174,
 -0.6343474984169006,
 -0.35113489627838135,
 -0.7091270089149475,
 -0.685415506362915,
 0.7139946222305298,
 -1.056103229522705,
 -0.252523273229599,
 -0.5784677267074585,
 0.26691409945487976,
 0.12073056399822235,
 0.14995330572128296,
 -0.3952682316303253,
 -1.0421907901763916,
 -0.01434

In [34]:
f_docs = db.similarity_search_by_vector(embedding_vector)
f_docs

[Document(id='16f6286e-cc36-4717-a6a0-5dbd017c338d', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/mnt/c/Personal/learnlang/1706.03762v7.pdf', 'total_pages': 15, 'page': 3, 'page_label': '4'}, page_content='Scaled Dot-Product Attention\n Multi-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nof the values, where the weight assigned to each value is computed by a compatibility function of the\nquery with the corresponding key.\n3.2.1 Scaled Dot-Product Attention\nWe call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of'),
 Document(id='c2

In [35]:
##Saving and Loading

db.save_local('faiss_local')

In [36]:
new_db = FAISS.load_local('faiss_local',embedding,allow_dangerous_deserialization=True)
new_db

<langchain_community.vectorstores.faiss.FAISS at 0x7f4f768f7ee0>

In [37]:
f_docs = new_db.similarity_search(query)
f_docs

[Document(id='16f6286e-cc36-4717-a6a0-5dbd017c338d', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/mnt/c/Personal/learnlang/1706.03762v7.pdf', 'total_pages': 15, 'page': 3, 'page_label': '4'}, page_content='Scaled Dot-Product Attention\n Multi-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nof the values, where the weight assigned to each value is computed by a compatibility function of the\nquery with the corresponding key.\n3.2.1 Scaled Dot-Product Attention\nWe call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of'),
 Document(id='c2

### _Chroma_ ###

In [38]:
from langchain_community.vectorstores import Chroma

In [39]:
vectordb = Chroma.from_documents(split_doc,embedding)
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x7f4f75cbbd00>

In [41]:
query = 'We call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of'
f_docs = vectordb.similarity_search(query)
f_docs[0].page_content

'Scaled Dot-Product Attention\n Multi-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nof the values, where the weight assigned to each value is computed by a compatibility function of the\nquery with the corresponding key.\n3.2.1 Scaled Dot-Product Attention\nWe call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of'

In [42]:
#Saving to the disk
vectordb = Chroma.from_documents(split_doc,embedding,persist_directory='./chromadb')

In [44]:
#load from disc
chromadb2 = Chroma(persist_directory='./chromadb',embedding_function=embedding)
f_docs = chromadb2.similarity_search(query)
print(f_docs[0].page_content)
f_docs

Scaled Dot-Product Attention
 Multi-Head Attention
Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several
attention layers running in parallel.
of the values, where the weight assigned to each value is computed by a compatibility function of the
query with the corresponding key.
3.2.1 Scaled Dot-Product Attention
We call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of


[Document(metadata={'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'total_pages': 15, 'keywords': '', 'trapped': '/False', 'producer': 'pdfTeX-1.40.25', 'title': '', 'moddate': '2024-04-10T21:11:43+00:00', 'page_label': '4', 'page': 3, 'author': '', 'creator': 'LaTeX with hyperref', 'subject': '', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': '/mnt/c/Personal/learnlang/1706.03762v7.pdf'}, page_content='Scaled Dot-Product Attention\n Multi-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nof the values, where the weight assigned to each value is computed by a compatibility function of the\nquery with the corresponding key.\n3.2.1 Scaled Dot-Product Attention\nWe call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of'),
 Document(metadata={'page_label': '4', 'title': '', 'author

In [47]:
#Retriever
retriever_Chroma = chromadb2.as_retriever()
retriever_Chroma.invoke(query)[0].page_content

'Scaled Dot-Product Attention\n Multi-Head Attention\nFigure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several\nattention layers running in parallel.\nof the values, where the weight assigned to each value is computed by a compatibility function of the\nquery with the corresponding key.\n3.2.1 Scaled Dot-Product Attention\nWe call our particular attention "Scaled Dot-Product Attention" (Figure 2). The input consists of'