# **Vectorstores**


In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
from rich import print

os.environ['COHERE_API_KEY'] = os.getenv('COHERE_API_KEY')

## 1. FAISS - Facebook AI Similarity Search

In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_cohere import CohereEmbeddings
from langchain_community.vectorstores import  FAISS

# load the document
loader = TextLoader("Data Transformation Simplified Expl.txt")
document = loader.load()
# split the document into chunks
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=60)
doc_chunks = text_splitter.split_documents(document)

In [3]:
doc_chunks

[Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content="Data Transformation: Simplified Explanation\n\nWhat is Data Transformation?\nData transformation is like changing the way we measure things to make them easier to understand and compare. It's like converting inches to centimeters or changing dollars to euros to help with calculations and comparisons."),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content='Why and Where?\nData transformation is important because it helps us work with data more effectively. We transform data to make it more useful for analysis and building models.'),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content='Practical Uses:\n- Scaling: Changing the range or size of numbers.\n- Normalization: Making data follow a certain pattern or shape.'),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content='Real-Life Example:\nThink ab

In [4]:
# we're using cohere embeddings
embeddings = CohereEmbeddings()
# create the vector store
vectorstore = FAISS.from_documents(doc_chunks, embeddings)
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x23c729e0ad0>

#### Search Method

In [5]:
# searching a word from vectorstore
retrieved_docs = vectorstore.search("Scaling", search_type="similarity", k=2)
print(retrieved_docs)

#### Similarity Search Method

In [6]:
# using similarity search
retrieved_docs = vectorstore.similarity_search("What is Data Transformation?", k=3)
print(retrieved_docs[0].page_content)

#### Similarity Search Score Method

In [7]:
# using similarity search with score to retrieve relevant chunks. The lower the score, the more relevant. it uses L2 score 
# that is also called Manhatton Distance (it gives distance from query)
retrieved_docs = vectorstore.similarity_search_with_score("What is Data Transformation?", k=2)
print(retrieved_docs)

#### Similarity Search by Vector Method 
We can also pass a vector instead of text query to vectorstore.


In [8]:
vector_query = embeddings.embed_query("What is Data Transformation?")
vector_retrieved_docs = vectorstore.similarity_search_by_vector(vector_query, k=2)
print(vector_retrieved_docs)

### Saving Vectorstore in Local and then Loading it

In [9]:
vectorstore.save_local("faiss_index") # pickel file will be saved in current directory

In [10]:
# loading vectorstore
new_vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
new_vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x23c7b402c10>

In [11]:
# retrieving chunks from our loaded vectorstore
chunks = new_vectorstore.similarity_search_with_score("What is Min-Max Scaling?", k=2) 
print(chunks)

## Vectorstore as a Retriever

In [12]:
retriever = vectorstore.as_retriever()
retriever.invoke("What is Data Transformation?")

[Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content="Data Transformation: Simplified Explanation\n\nWhat is Data Transformation?\nData transformation is like changing the way we measure things to make them easier to understand and compare. It's like converting inches to centimeters or changing dollars to euros to help with calculations and comparisons."),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content='Why and Where?\nData transformation is important because it helps us work with data more effectively. We transform data to make it more useful for analysis and building models.'),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content="In summary, data transformation methods like Min-Max Scaling and Standard Scaling are like adjusting data to make it easier to work with and compare. They're useful for various situations where you want to ensure fair comparisons and prevent some data

## 2. Chroma

In [13]:
from langchain_community.vectorstores import Chroma

chroma_vectorstore = Chroma.from_documents(doc_chunks, embeddings, persist_directory="./chroma_db")
chroma_vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x23c7b40d250>

#### Similarity Search Method

In [14]:
retrieved_docs = chroma_vectorstore.similarity_search("What is Data Transformation?", k=2)
print(retrieved_docs)

#### Similarity Search Score Method

In [15]:
retrieved_docs = chroma_vectorstore.similarity_search_with_score("What is Data Transformation?", k=2)
print(retrieved_docs)

### Loading Chroma DB and using it

In [20]:
new_chroma_db = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
new_chroma_db.similarity_search("What is Data Transformation?", k=2)

[Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content="Data Transformation: Simplified Explanation\n\nWhat is Data Transformation?\nData transformation is like changing the way we measure things to make them easier to understand and compare. It's like converting inches to centimeters or changing dollars to euros to help with calculations and comparisons."),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content='Why and Where?\nData transformation is important because it helps us work with data more effectively. We transform data to make it more useful for analysis and building models.')]

#### Deleting ChromaDB from local

In [17]:
ids = chroma_vectorstore.get(where={'source': 'Data Transformation Simplified Expl.txt'})['ids']
ids

['06be8360-66f0-4edc-a940-b37b2c8026d7',
 '15317816-739e-46b5-9036-3b3b99a9f077',
 '1d135b76-7b46-4355-80c4-94438e2f646a',
 '241a44e7-6f09-4136-8ad8-5aaae33eb464',
 '3259f129-7d53-4fbd-989a-36a41e1a49b2',
 '938f43b9-6e04-4f01-b79b-cc039bd8fa47',
 'c4cfc39b-b405-4664-ad05-863e8d03dff4',
 'e5c21c8c-49c4-4287-bd76-07ff32777874',
 'eec10cf3-d384-4281-9242-722c31a5a034']

In [18]:
# deleting all ids
# chroma_vectorstore.delete(ids=ids)

## Vectorstore as a Retriever (Chroma)

In [21]:
retriever = new_chroma_db.as_retriever()
retriever.invoke("What is Min-Max Scaling?")

[Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content="Min-Max Scaling: Simplified Explanation\n\nStandard Scaling is like making all your numbers play nicely with each other by giving them a common center (average) and making sure they're all about the same distance (standard deviation) from that center."),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content='Real-Life Example:\nThink about temperatures. In some places, they use Celsius, and in others, Fahrenheit. If you want to compare temperatures accurately, you might convert everything to one scale, like Celsius.\n\nMin-Max Scaling: Simplified Explanation'),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content="In summary, data transformation methods like Min-Max Scaling and Standard Scaling are like adjusting data to make it easier to work with and compare. They're useful for various situations where you want to ensure fair compa