# Image Embeddings

In [1]:
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')



In [3]:
image = Image.open("example.jpg")
inputs = feature_extractor(images=image, return_tensors='pt')

In [4]:
# generate embedings
with torch.no_grad():
    outputs = model(**inputs)
    image_embedding = outputs.last_hidden_state.mean(dim=1) # average pooling

print(image_embedding.shape)
# print(image_embedding)

torch.Size([1, 768])


# Audio Embedding

In [5]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import librosa

In [6]:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
audio_file = "example.wav"
audio, rate = librosa.load(audio_file, sr=16000)
inputs = feature_extractor(audio, return_tensors="pt", sampling_rate=rate)

In [8]:
with torch.no_grad():
    outputs = model(**inputs)
    audio_embedding = outputs.last_hidden_state.mean(dim=1) # average pooling

print(audio_embedding.shape)
# print(audio_embedding)

torch.Size([1, 768])


# Text Embedding

In [9]:
from transformers import AutoTokenizer, AutoModel
import torch

In [11]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:
text = "This is an example sentence for embedding"
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

In [13]:
with torch.no_grad():
    outputs = model(**inputs)
    text_embedding = outputs.last_hidden_state.mean(dim=1)  #average pooling

print(text_embedding.shape)
# print(text_embedding)

torch.Size([1, 768])


# Multi Modal Embedding

In [14]:
combined_embedding = torch.cat([
    image_embedding,
    audio_embedding,
    text_embedding],
    dim=1)

print(combined_embedding.shape)

torch.Size([1, 2304])


# Store Data in Vector DB

In [15]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.schema import Document

In [16]:
embedding_model = HuggingFaceBgeEmbeddings(model_name="all-MiniLm-L6-v2")

  embedding_model = HuggingFaceBgeEmbeddings(model_name="all-MiniLm-L6-v2")
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [18]:
chroma_db = Chroma(
    persist_directory='./chroma_text_db',
    embedding_function=embedding_model)

In [19]:
documents = [
    Document(page_content="Elon Musk leads Tesla and SpaceX.", metadata={"source":"1"}),
    Document(page_content="Tesla's mission is to accelerate the world's transition to sustainable energy.", metadata={"source":"2"}),
    Document(page_content="SpaceX aims to make space travel accessible to humanity.", metadata={"source":"3"})
]

In [20]:
chroma_db.add_documents(documents)

['2e688f70-f001-4cf9-84d1-7af826e92278',
 'be10df48-0edd-47b2-ad16-40b32cca9673',
 '594d514d-8876-4dee-8589-ff19ab9bc136']

In [21]:
query_text = "What is Tesla's mission?"
results = chroma_db.similarity_search(query=query_text, k=2)

In [22]:
for idx, result in enumerate(results, 1):
    print(f"Result {idx}:")
    print(f"Content {result.page_content}:")
    print(f"Metadata {result.metadata}:")

Result 1:
Content Tesla's mission is to accelerate the world's transition to sustainable energy.:
Metadata {'source': '2'}:
Result 2:
Content Elon Musk leads Tesla and SpaceX.:
Metadata {'source': '1'}:


# Vector Search

In [23]:
movie_data =[
    {"title": "Inception", "description": "A mind-bending thriller where dream invasion is possible.", "genre":"Sci-Fi"},
    {"title": "The Matrix", "description": "A hacker discovers the truth about his reality.", "genre":"Sci-Fi"},
    {"title": "Titanic", "description": "A tragic love story set aboard a doomed ocean liner.", "genre":"Romance"},
    {"title": "The Godfather", "description": "The saga of a crime family and its legacy.", "genre":"Crime"},
    {"title": "Interstellar", "description": "A space epic exploring love, survival, and time.", "genre":"Sci-Fi"},
]

In [None]:
from langchain.schema import Document

documents = [Document(page_content=movie['description'], metadata={"title":movie['title'], 'genre':movie['genre']}) for movie in movie_data]

In [25]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

vector_store = Chroma.from_documents(documents, embedding_model)

  embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


In [26]:
def vector_search(query, top_k=3):
    results = vector_store.similarity_search(query, k=top_k)
    return [
        {"title": res.metadata['title'],
         "description": res.page_content,
         "genre": res.metadata['genre']}

        for res in results]

In [None]:
def search_by_genre(query, genre):
    all_results = vector_store.similarity_search(query=query, k=10)
    filtered_results = [res for res in all_results if res.metadata['genre']==genre]
    return filtered_results

In [29]:
query1 = "A story about dreams and reality."
vector_search(query1)

[{'title': 'Inception',
  'description': 'A mind-bending thriller where dream invasion is possible.',
  'genre': 'Sci-Fi'},
 {'title': 'Interstellar',
  'description': 'A space epic exploring love, survival, and time.',
  'genre': 'Sci-Fi'},
 {'title': 'The Matrix',
  'description': 'A hacker discovers the truth about his reality.',
  'genre': 'Sci-Fi'}]

In [34]:
query1 = "A story about dreams and reality."
search_by_genre(query1, 'Sci-Fi')

Number of requested results 10 is greater than number of elements in index 5, updating n_results = 5


[Document(metadata={'genre': 'Sci-Fi', 'title': 'Inception'}, page_content='A mind-bending thriller where dream invasion is possible.'),
 Document(metadata={'genre': 'Sci-Fi', 'title': 'Interstellar'}, page_content='A space epic exploring love, survival, and time.'),
 Document(metadata={'genre': 'Sci-Fi', 'title': 'The Matrix'}, page_content='A hacker discovers the truth about his reality.')]

# Semantic Search

In [35]:
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.schema import Document

In [36]:
movie_data =[
    {"title": "Inception", "description": "A mind-bending thriller where dream invasion is possible.", "genre":"Sci-Fi"},
    {"title": "The Matrix", "description": "A hacker discovers the truth about his reality.", "genre":"Sci-Fi"},
    {"title": "Titanic", "description": "A tragic love story set aboard a doomed ocean liner.", "genre":"Romance"},
    {"title": "The Godfather", "description": "The saga of a crime family and its legacy.", "genre":"Crime"},
    {"title": "Interstellar", "description": "A space epic exploring love, survival, and time.", "genre":"Sci-Fi"},
]

In [37]:
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embedding_model = HuggingFaceBgeEmbeddings(model_name=embedding_model_name)

In [38]:
documents = [Document(page_content=movie['description'], metadata={"title":movie['title']}) for movie in movie_data]

In [39]:
vector_store = Chroma.from_documents(documents, embedding_model)

In [40]:
llm_model_name = 'google/flan-t5-large'
hf_pipeline = pipeline("text2text-generation", model=llm_model_name, device=0)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu
  llm = HuggingFacePipeline(pipeline=hf_pipeline)


In [41]:
retriever = vector_store.as_retriever()

In [42]:
def custom_prompt(query):
    prompt = f"""
    You are a helpful assistant with retrieving movie titles based on descriptions.
    Query: {query}
    From the following dataset, only provide movie titles that match:
    Dataset:
    {", ".join([doc.metadata['title'] for doc in documents])}
    Response:
    """
    return prompt

In [43]:
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

In [44]:
semantic_query = "Find me movies about astronauts struggling to survive in space."
custom_query = custom_prompt(semantic_query)
semantic_results = qa_chain.run(custom_query)

  semantic_results = qa_chain.run(custom_query)


In [45]:
print(semantic_results)

Titanic
