# **Embedding Techniques**

In [8]:
import os

from dotenv import load_dotenv
from rich import print  # shadows the built-in print with rich's version for the rest of the notebook

# Read the local .env file (if any) so its variables are available via os.environ / os.getenv
load_dotenv()

In [9]:
def export_api_key(name: str) -> bool:
    """Re-export an API key into ``os.environ`` if it is available.

    Assigning ``os.getenv(name)`` straight into ``os.environ`` raises an
    opaque ``TypeError`` when the variable is unset (``os.environ`` only
    accepts strings). This helper checks first and reports the missing key
    by name instead, so the sections of the notebook that do not need that
    provider can still run.

    Args:
        name: Name of the environment variable holding the key.

    Returns:
        True if the variable was set (and re-exported), False if it was
        missing or empty.
    """
    value = os.getenv(name)
    if not value:
        print(f"Warning: {name} is not set; add it to your .env file before using that provider.")
        return False
    os.environ[name] = value
    return True


# Keys expected by the embedding providers used in this notebook
for key_name in ('GOOGLE_API_KEY', 'COHERE_API_KEY'):
    export_api_key(key_name)

# 1. Google Embeddings

In [10]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Name of the Google embedding model used for the first set of examples
# NOTE(review): confirm this model id is still served by the API
google_model_name = "models/embedding-001"

embeddings = GoogleGenerativeAIEmbeddings(model=google_model_name)
print(embeddings)

In [11]:
# Embed a single sentence with the Google embedder created above
vector = embeddings.embed_query("This is Google GenAI embeddings")
print(vector[:5])  # first 5 components of the embedding vector

In [12]:
# Number of components in the embedding produced for the sentence above
len(vector)

768

In [13]:
# Second embedder backed by a different Google model; the next cell uses it to request a shorter vector
embeddings_512 = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")  # another model

In [14]:
# Embed the same sentence, asking for a 512-component vector via `output_dimensionality`
vector_512 = embeddings_512.embed_query("This is Google GenAI embeddings", output_dimensionality=512)
# Show the reduced vector itself (not the earlier full-size `vector`)
print(vector_512[:5])  # first 5 components of the reduced embedding

In [15]:
# Confirm the vector has the reduced number of components requested above
len(vector_512) # reduced dimensions

512

## Converting document chunks to embeddings, storing them in a vector DB, and retrieving the chunks based on a query

In [16]:
from langchain_community.document_loaders import TextLoader

# Plain-text file used as the example corpus (path is relative to the notebook's working directory)
source_path = "Data Transformation Simplified Expl.txt"

# Read the file into LangChain documents
loader = TextLoader(source_path)
text_document = loader.load()
text_document

[Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content="Data Transformation: Simplified Explanation\n\nWhat is Data Transformation?\nData transformation is like changing the way we measure things to make them easier to understand and compare. It's like converting inches to centimeters or changing dollars to euros to help with calculations and comparisons.\n\nWhy and Where?\nData transformation is important because it helps us work with data more effectively. We transform data to make it more useful for analysis and building models.\n\nPractical Uses:\n- Scaling: Changing the range or size of numbers.\n- Normalization: Making data follow a certain pattern or shape.\n\nReal-Life Example:\nThink about temperatures. In some places, they use Celsius, and in others, Fahrenheit. If you want to compare temperatures accurately, you might convert everything to one scale, like Celsius.\n\nMin-Max Scaling: Simplified Explanation\n\nStandard Scaling is like making al

In [17]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters: target chunk size and how much neighbouring chunks may overlap
chunk_size = 300
chunk_overlap = 60

# Break the loaded document into overlapping chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
doc_chunks = splitter.split_documents(text_document)
doc_chunks

[Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content="Data Transformation: Simplified Explanation\n\nWhat is Data Transformation?\nData transformation is like changing the way we measure things to make them easier to understand and compare. It's like converting inches to centimeters or changing dollars to euros to help with calculations and comparisons."),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content='Why and Where?\nData transformation is important because it helps us work with data more effectively. We transform data to make it more useful for analysis and building models.'),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content='Practical Uses:\n- Scaling: Changing the range or size of numbers.\n- Normalization: Making data follow a certain pattern or shape.'),
 Document(metadata={'source': 'Data Transformation Simplified Expl.txt'}, page_content='Real-Life Example:\nThink ab

## Chroma DB

In [18]:
from langchain_community.vectorstores import Chroma

# Embed every chunk with the Google embedder and store the result in a Chroma
# vector store that is persisted under ./chroma_db.
# NOTE(review): since the store is persisted on disk, re-running this cell may
# add the same chunks again — confirm before relying on result counts.
chroma_vectorstore = Chroma.from_documents(
    doc_chunks,
    embedding=embeddings,
    persist_directory="./chroma_db",
)
chroma_vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x1f16b0d0350>

In [19]:
# Use similarity search to retrieve the chunks most relevant to the query
# from the Chroma vector store (k=3 asks for three results)
query = "What is Data Transformation?"
retrieved_chunks = chroma_vectorstore.similarity_search(query=query, k=3)
print(retrieved_chunks)

In [20]:
# Same lookup against Chroma, but each hit is returned together with a relevance score (k=2 asks for two results).
# Reuses `query` from the previous cell.
# NOTE(review): scores are presumably normalised so that higher means more relevant — confirm in the LangChain docs.
retrieved_chunks = chroma_vectorstore.similarity_search_with_relevance_scores(query=query, k=2)
print(retrieved_chunks)

### Deleting all embeddings from ChromaDB

In [21]:
# Select every stored record that came from the example text file, then keep only the record ids
source_filter = {'source': 'Data Transformation Simplified Expl.txt'}
matching_records = chroma_vectorstore.get(where=source_filter)
ids = matching_records['ids']

In [22]:
# Delete every record whose id was collected in `ids` from the Chroma store
chroma_vectorstore.delete(ids=ids)

In [23]:
# Verify that all embeddings have been deleted: the same metadata filter should now match no ids
chroma_vectorstore.get(where={'source': 'Data Transformation Simplified Expl.txt'})['ids']

[]

## FAISS DB

In [24]:
from langchain_community.vectorstores import FAISS

# Build a FAISS vector store from the document chunks, embedding each chunk
# with the Google embedder (no persistence location is configured here).
faiss_vectorstore = FAISS.from_documents(
    doc_chunks,
    embedding=embeddings,
)
faiss_vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x1f16b0d0250>

In [25]:
# Use similarity search to retrieve the chunks most relevant to the query
# from the FAISS vector store (k=2 asks for two results)
query = "What is Data Transformation?"
retrieved_chunks = faiss_vectorstore.similarity_search(query=query, k=2)
print(retrieved_chunks)

In [26]:
# Same lookup against FAISS, returning each chunk together with its raw score.
# NOTE(review): the raw FAISS score is presumably a distance (lower = closer) — confirm in the LangChain docs.
retrieved_chunks = faiss_vectorstore.similarity_search_with_score(query=query, k=2)
print(retrieved_chunks)

In [27]:
# Same lookup against FAISS, returning each chunk together with a relevance score.
# NOTE(review): relevance scores are presumably normalised so that higher means more relevant — confirm.
retrieved_chunks = faiss_vectorstore.similarity_search_with_relevance_scores(query=query, k=2)
print(retrieved_chunks)

### Deleting all embeddings from FAISS

In [28]:
# Snapshot the docstore ids of every stored vector as a plain list.
# `.values()` alone is a live view of the store's own mapping; copying it gives
# `delete` an independent list of ids that cannot change while the store is being modified.
ids = list(faiss_vectorstore.index_to_docstore_id.values())

In [29]:
# Delete the entries for all collected ids from the FAISS store
faiss_vectorstore.delete(ids=ids)

True

In [30]:
# Verify that all embeddings have been deleted: the index-to-docstore-id mapping should now be empty
faiss_vectorstore.index_to_docstore_id

{}

# 2. Cohere Embeddings

In [31]:
from langchain_cohere import CohereEmbeddings

# 3. Hugging Face Embeddings

In [34]:
from langchain_huggingface import HuggingFaceEmbeddings

# Name of the Hugging Face embedding model to load
hf_model_name = "all-MiniLM-L6-v2"

hf_embeddings = HuggingFaceEmbeddings(model_name=hf_model_name)
print(hf_embeddings)

In [35]:
# Convert a single sentence to an embedding with the Hugging Face embedder
hf_vector = hf_embeddings.embed_query("I'm using Huggingface embeddings")
print(len(hf_vector))  # number of components in the embedding
print(hf_vector[:4])  # first 4 components

In [41]:
# Embed several texts in one call; the result holds one vector per input text
sample_sentences = [
    "Hello, how are you",
    "I'm well. What about you?",
]
hf_docs_vector = hf_embeddings.embed_documents(sample_sentences)

# Unpack the two vectors so each can be inspected by name
first_vector, second_vector = hf_docs_vector
print("First sentence length:", len(first_vector))
print("Second sentence length:", len(second_vector))

# First 5 components of the second sentence's embedding
print(second_vector[:5])