Deep dive into Hybrid Search and Metadata Filtering

- Keyword Search + Semantic Search + Reciprocal Ranking
- Extract keywords from user question and perform search with metadata filtering
- Semantic Search with Metadata filtering with SelfQueryRetriever

In [1]:
import uuid

In [2]:
! pip install lark --q
! pip install langchain --q
! pip install langchain_core --q
! pip install chromadb --q
! pip install langchain_together --q
! pip install langchain-community --q
# ! pip install sentence_transformers --q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.7/111.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.8/321.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.1/127.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.0/145.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

### Loading keys from Colab Secrets

In [3]:
# Read the OpenAI key from the Colab secret named 'OPENAI_KEY' and publish it as
# the OPENAI_API_KEY environment variable; later cells read it back from
# os.environ when building the Chroma embedding function.
from google.colab import userdata
import os
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_KEY')

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# Chat model reused by the entity-extraction chain and the query constructor.
llm = ChatOpenAI(temperature=0)

### Source Documents with Metadata

In [4]:
from langchain_core.documents import Document

# Sample movie corpus: each entry pairs a one-line plot summary with its
# metadata. Not every movie carries every metadata key.
docs = [
    Document(page_content=summary, metadata=meta)
    for summary, meta in (
        (
            "A bunch of scientists bring back dinosaurs and mayhem breaks loose",
            {"year": 1993, "rating": 7.7, "genre": "science fiction"},
        ),
        (
            "Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
            {"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
        ),
        (
            "A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
            {"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
        ),
        (
            "A bunch of normal-sized women are supremely wholesome and some men pine after them",
            {"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
        ),
        (
            "Toys come alive and have a blast doing so",
            {"year": 1995, "genre": "animated"},
        ),
        (
            "Three men walk into the Zone, three men walk out of the Zone",
            {
                "year": 1979,
                "director": "Andrei Tarkovsky",
                "genre": "thriller",
                "rating": 9.9,
            },
        ),
    )
]

### ChromaDB collections

In [5]:
import chromadb
# On-disk Chroma client stored under ./chroma_tmp; allow_reset is switched on
# because reset() is called right away to start from a clean store.
client = chromadb.PersistentClient(
    path="chroma_tmp",
    settings=chromadb.Settings(allow_reset=True),
)
client.reset()

True

In [6]:
import chromadb.utils.embedding_functions as embedding_functions
# OpenAI-backed embedding function for Chroma; the API key is taken from the
# OPENAI_API_KEY environment variable.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(api_key=os.environ["OPENAI_API_KEY"])

In [7]:
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# once a collection named "movies" already exists in the persistent store.
collection = client.get_or_create_collection(
    name="movies",
    metadata={"hnsw:space": "cosine"},  # use cosine distance for the HNSW index
    embedding_function=openai_ef,
)

In [8]:
# Add the whole corpus in one call so the documents are embedded as a single
# batch instead of issuing one add (and one embedding request) per document.
# The guard preserves the original no-op behaviour for an empty corpus.
if docs:
    collection.add(
        documents=[doc.page_content for doc in docs],
        ids=[str(uuid.uuid1()) for _ in docs],  # fresh unique id per document
        metadatas=[doc.metadata for doc in docs],
    )

# Search with manual metadata extraction from the user query using an LLM call

In [9]:
from pydantic import BaseModel, Field
from typing import Optional


# Metadata constraints the LLM extracts from a user question.
# Each field defaults to None: without an explicit default, pydantic v2 treats
# an Optional[...] field as required, so parsing would fail whenever the LLM
# leaves out an entity the question does not mention.
# (Kept as comments rather than a class docstring, because a docstring would be
# emitted into the JSON schema that the output parser sends to the LLM.)
class Metadata(BaseModel):
    year: Optional[int] = Field(default=None, description="extract year from the question")
    director: Optional[str] = Field(default=None, description="extract movie director from the question")
    genre: Optional[str] = Field(default=None, description="extract movie genre from the question")
    rating: Optional[float] = Field(default=None, description="extract movie rating from the question")

In [10]:
from langchain.output_parsers import PydanticOutputParser
# Parser that turns the LLM reply into a Metadata instance and supplies the
# format instructions embedded in the prompt.
parser = PydanticOutputParser(pydantic_object=Metadata)

In [11]:
from langchain_core.prompts import ChatPromptTemplate


# Prompt for the entity-extraction step: {question} is supplied at invoke time,
# while {format_instructions} is pre-filled from the parser.
template = """
you only need to extract entities and nothing else.
Based on the given question extract entities like movie year,director,genre,rating.
Extract entity only if the there is a mention of the same in the question.

Question: {question}

{format_instructions}

"""
prompt = ChatPromptTemplate.from_template(
    template,
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipeline: render the prompt, call the LLM, parse the reply with `parser`.
chain = prompt | llm | parser

In [12]:
# Example question carrying two metadata constraints: a year and a minimum rating.
question = 'Give me details of the movie from 2010 with rating more than 8.0'

In [13]:
# Run the extraction chain; the result is the parsed Metadata object.
output = chain.invoke(question)

In [14]:
# Display the extracted metadata.
output

Metadata(year=2010, director=None, genre=None, rating=8.0)

### Build filter from extracted metadata

In [15]:
# Translate the extracted metadata into Chroma "where" clauses.
# Iterating the pydantic model yields (field_name, value) pairs.
# NOTE: the name `filter` shadows the Python builtin; it is kept because the
# following cells read it.
filter = []
for field_name, value in output:
    # Skip entities that were not extracted (None or an empty string). A plain
    # truthiness test would also discard legitimate values such as 0 or 0.0.
    if value is None or value == "":
        continue
    # A rating is treated as a lower bound; every other field must match exactly.
    operator = "$gt" if field_name == 'rating' else "$eq"
    filter.append({field_name: {operator: value}})

In [16]:
# Display the assembled filter clauses.
filter

[{'year': {'$eq': 2010}}, {'rating': {'$gt': 8.0}}]

Search with filter

In [17]:
# Chroma's "$and" operator needs at least two sub-expressions, so the clauses
# are wrapped only when there are several. A single clause is passed as-is and
# an empty list means "no metadata filter".
if len(filter) > 1:
    where = {"$and": filter}
elif filter:
    where = filter[0]
else:
    where = None

# Semantic search over the collection, restricted by the metadata filter.
results = collection.query(
    query_texts=[question],
    n_results=2,
    where=where,
)

In [18]:
# Documents returned for the first (and only) query text.
results['documents'][0]

['Leo DiCaprio gets lost in a dream within a dream within a dream within a ...']

In [19]:
# Clean up before the next section: drop the "movies" collection, reset the
# client, then list the remaining collections (shown as the cell output).
client.delete_collection(name="movies")
client.reset()
client.list_collections()

[]

# Semantic Search with Metadata filtering with SelfQueryRetriever

Load documents into Vector DB

In [20]:
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

# Chunking: split documents into pieces of at most 500 characters (measured
# with len) with no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, length_function=len)
new_docs = text_splitter.split_documents(docs)

# Vector store: index the chunks in Chroma using OpenAI embeddings.
db = Chroma.from_documents(documents=new_docs, embedding=OpenAIEmbeddings())

### Metadata attribute info

In [21]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

# Schema of the filterable metadata, one (name, description, type) triple per
# attribute, passed to the query-constructor prompt.
metadata_field_info = [
    AttributeInfo(name=field, description=details, type=kind)
    for field, details, kind in (
        (
            "genre",
            "The genre of the movie. One of ['science fiction', 'comedy', 'drama', 'thriller', 'romance', 'action', 'animated']",
            "string",
        ),
        ("year", "The year the movie was released", "integer"),
        ("director", "The name of the movie director", "string"),
        ("rating", "A 1-10 rating for the movie", "float"),
    )
]

# Short description of what each document's text contains.
document_content_description = "Brief summary of a movie"

### Structured Query Constructor with LLM

In [22]:
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)

# Prompt built from the content description and the attribute schema.
# NOTE: this rebinds `prompt`, which previously held the entity-extraction prompt.
prompt = get_query_constructor_prompt(document_content_description, metadata_field_info)

# Parser applied to the LLM reply to obtain the structured query.
output_parser = StructuredQueryOutputParser.from_components()

# Pipeline: prompt -> LLM -> structured-query parser.
query_constructor = prompt | llm | output_parser

In [23]:
# Inspect the structured query produced for the question (no retrieval yet).
query_constructor.invoke(question)

StructuredQuery(query=' ', filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2010), Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.0)]), limit=None)

### Translate the query according to the vectorDB

In [24]:
from langchain.retrievers.self_query.chroma import ChromaTranslator

# Self-querying retriever wired from three parts: the vector store to search,
# the translator for Chroma, and the LLM-backed query constructor.
retriever = SelfQueryRetriever(
    vectorstore=db,
    structured_query_translator=ChromaTranslator(),
    query_constructor=query_constructor,
)

In [25]:
# Re-display the question that is sent to the retriever next.
question

'Give me details of the movie from 2010 with rating more than 8.0'

In [26]:
# Run the self-query retriever on the same question used for the manual filter.
retriever.invoke(question)

[Document(page_content='Leo DiCaprio gets lost in a dream within a dream within a dream within a ...', metadata={'director': 'Christopher Nolan', 'rating': 8.2, 'year': 2010})]

In [27]:
# A second query that combines a year range, a topic and a genre preference.
retriever.invoke(
    "What's a movie after 1990 but before 2005 that's all about toys, and preferably is animated"
)

[Document(page_content='Toys come alive and have a blast doing so', metadata={'genre': 'animated', 'year': 1995})]