In [1]:
from pinecone import Pinecone, ServerlessSpec
import os
import numpy as np
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
from transformers import AutoProcessor, AutoModel
import torch
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from typing import Dict, List
# from src_ubc.classes.survey import Preferences
from classes.survey import Preferences

  from .autonotebook import tqdm as notebook_tqdm


## Create Serverless Index

In [2]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY_UBC"))

In [7]:
# print(pc.list_indexes())

## Initalize Dense DB

In [3]:
# create dense index with integrated embedding 
index_name = "museum-ai-dense"
if not pc.has_index(index_name):
    pc.create_index(index_name, 
                    dimension=512, 
                    metric="cosine", 
                    vector_type="dense",
                    spec=ServerlessSpec( # index deployed region
                        cloud="aws",
                        region="us-east-1"
                    ),
                )
    
dense_db = pc.Index(index_name)

## Initalize Sparse DB

In [4]:
# create sparse index with integrated embedding
index_name = "museum-ai-sparse"
if not pc.has_index(index_name):
    pc.create_index(index_name, 
                    # dimension=512, 
                    metric="dotproduct", 
                    vector_type="sparse",
                    spec=ServerlessSpec(
                        cloud="aws",
                        region="us-east-1"
                    ),
                )

sparse_db = pc.Index(index_name)

## Search Functions

In [5]:
MODEL_NAME = "BAAI/BGE-VL-base" # or "BAAI/BGE-VL-large"

device = "cuda" if torch.cuda.is_available() else "cpu"
dense_model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True).to(device) # You must set trust_remote_code=True
dense_model_processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
dense_model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [6]:
from PIL import Image

def get_text_embeddings(texts: list[str]) -> list:
    if not texts:
        return []
    inputs = dense_model_processor(text=texts, return_tensors="pt", truncation=True, padding=True).to(device)
    return dense_model.get_text_features(**inputs).cpu()

def get_image_embeddings(image_paths: list[str]) -> list:
    if not image_paths:
        return []
    images = [Image.open(image_path).convert("RGB") for image_path in image_paths]
    inputs = dense_model_processor(images=images, return_tensors="pt").to(device)
    return dense_model.get_image_features(**inputs).cpu()

In [7]:
# Load model directly
sparse_model_tokenizer = AutoTokenizer.from_pretrained("naver/splade-v3")
sparse_model = AutoModelForMaskedLM.from_pretrained("naver/splade-v3").to(device)

In [8]:
def get_sparse_embeddings(texts: list[str]):
    if not texts:
        return []
    tokens = sparse_model_tokenizer(texts, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = sparse_model(**tokens)
    sparse_embedding = torch.max(torch.log(1 + torch.relu(outputs.logits)) * tokens.attention_mask.unsqueeze(-1), dim=1)[0].detach().cpu()
    
    # convert to pinecone sparse format
    res = []
    for i in range(len(sparse_embedding)):
        indices = sparse_embedding[i].nonzero().squeeze().tolist()
        values = sparse_embedding[i, indices].tolist()
        res.append({"indices": indices, "values": values})
    return res

In [9]:
def merge_chunks(h1, h2, h3):
    """Get the unique hits from two search results and return them as single array of {'_id', 'chunk_text'} dicts, printing each dict on a new line."""
    # Deduplicate by _id
    deduped_hits = {hit['id']: hit for hit in h1['matches'] + h2['matches']}.values()
    # Sort by _score descending
    sorted_hits = sorted(deduped_hits, key=lambda x: x['score'], reverse=True)
    # Transform to format for reranking
    # print(sorted_hits)
    return sorted_hits
    # result = [{'id': hit['id'], 'chunk_text': hit['metadata']['content']} for hit in sorted_hits]
    # return result

def search_with_text(query: str, tags: Preferences):
    dense_results = dense_db.query(
        namespace="umag",
        top_k=5,
        include_values=True,
        include_metadata=True,
        vector=get_text_embeddings([query])[0].tolist()
    ).matches

    sparse_results = sparse_db.query(
        namespace="umag",
        top_k=5,
        include_values=True,
        include_metadata=True,
        sparse_vector=get_sparse_embeddings([query])[0]
    ).matches
    
    tag_results = search_tags(tags)

    merged_results = rrf_merge(dense_results, sparse_results, tag_results)

    return merged_results


def search_with_image(image_path: str):
    dense_results = dense_db.query(
        namespace="umag",
        top_k=20,
        include_values=False,
        include_metadata=True,
        vector=get_image_embeddings([image_path])[0].tolist()
    )

    # sparse_results = sparse_db.query(
    #     namespace="umag",
    #     top_k=5,
    #     include_values=True,
    #     include_metadata=True,
    #     sparse_vector=get_sparse_vector(get_sparse_embeddings([query])[0])
    # )

    # merged_results = merge_chunks(sparse_results, dense_results)

    return dense_results

In [11]:
def get_number_tag_matches(tags: Preferences, hit: Dict) -> float:
    """
    Get the number of tag matches based on how many of the user's tags match the hit's metadata.
    
    Args:
        tags: A Preferences object with attributes:
            - time_period: List[str]
            - themes: List[str]
            - exhibits: List[str]
            - art_medium: List[str]
            - additional_interests: List[str]
        hit: A dictionary representing a search result hit.
        
    Returns:
        A float score representing the number of matching tags.
    """
    score = 0
    for key in tags.__dict__.keys():
        if key in hit['metadata']:
            matches = set(hit['metadata'][key]) & set(getattr(tags, key))
            score += len(matches)
    return score

In [12]:
def search_tags(tags: Preferences, top_k=5):
    """
    Perform a metadata search based on the provided Preferences object.
    
    Args:
        tags: A Preferences object with attributes:
            - time_period: List[str]
            - themes: List[str]
            - exhibits: List[str]
            - art_medium: List[str]
            - additional_interests: List[str]
        top_k: Number of results to return.
        
    Returns:
        List of matching documents from dense_db.
    """
    # Convert the Preferences object to a dictionary.
    tag_dict = {
        "time_period": tags.time_period,
        "themes": tags.themes,
        # "exhibits": tags.exhibits,
        "art_medium": tags.art_medium,
        "additional_interests": tags.additional_interests
    }

    # Remove any keys with empty lists.
    tag_dict = {k: v for k, v in tag_dict.items() if v}
    print(tag_dict)

    # Dummy vector for metadata filtering (using the dimensions expected by your dense index)
    dummy_vector = np.zeros(512).tolist()

 # Build metadata filter conditions. Each condition looks for documents where a given field contains at least one of the values.
    filter_conditions = []
    for key, values in tag_dict.items():
        filter_conditions.append({key: {"$in": values}})
    
    # Use $or operator so that if any condition matches the document is returned
    metadata_filter = {"$or": filter_conditions} if filter_conditions else {}

    # Perform the semantic search using the dummy vector and only filter by metadata.
    response = dense_db.query(
        namespace="umag",
        vector=dummy_vector,
        top_k=top_k,
        include_metadata=True,
        filter=metadata_filter
    )
    
    for i in range(len(response.matches)):
        # normalize the score based on the number of preferences
        response.matches[i]['score'] = get_number_tag_matches(tags, response.matches[i]) / tags.count_preferences()

    return response.matches

In [14]:
tags = Preferences(
    time_period=["ming"],
    themes=["symbolism"],
    exhibits=[],
    art_medium=["ceramics"],  # fixed typo: "cermanics" → "ceramics"
    additional_interests=["Sculpture"]
)

print(tags.count_preferences())  # Output should be 3

test_matches = search_tags(tags)
# test_matches

3
{'time_period': ['ming'], 'themes': ['symbolism'], 'art_medium': ['ceramics'], 'additional_interests': ['Sculpture']}


In [12]:
def calculate_rrf(dense_matches, sparse_matches, tag_matches, k=60):
    """
    Calculate the Reciprocal Rank Fusion (RRF) score for a list of matches.
    
    Args:
        dense_matches: List of dense search results.
        sparse_matches: List of sparse search results.
        tag_matches: List of tag search results.
        k: The k value for RRF calculation.
        
    Returns:
        A list of matches with RRF scores.
    """
    rrf_scores = {}
    
    for match in dense_matches + sparse_matches + tag_matches:
        doc_id = match['id']
        if doc_id not in rrf_scores:
            rrf_scores[doc_id] = 0
        rrf_scores[doc_id] += 1 / (k + match['score'])
    
    sorted_rrf = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    
    return [{'id': doc_id, 'rrf_score': score} for doc_id, score in sorted_rrf]

### Insert Data

In [20]:
import json 
with open("output/Objectifying_China/tagged/en_contents_chunked_sample2_embeddings.json", "r", encoding="utf-8") as f:
        data = json.load(f)

In [10]:

import json
import itertools

def setup_dense_db(file_path, namespace="umag"):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    vectors = []
    # count = 0
    for section_id, chunk in data.items():
        # create vector for text chunks
            vectors.append({
                "id": chunk["id"],
                "values": chunk["dense_text_embedding"],
                "metadata": {
                    "content": chunk["text"],
                    "type": "text",
                    "page_idx": [str(i) for i in chunk["page_idx"]], # convert to string for db
                    "header": chunk["header"],
                    "exhibit": chunk["exhibit"],
                    "time_period": chunk['time_period'],
                    "materiality": chunk["materiality"],
                    "region": chunk["region"],
                    "colour": chunk["colour"],
                    "purpose": chunk["purpose"],
                    "themes": chunk['themes'],
                }
            })
            
            # SKIPPING BECAUSE WE DON'T NEED IMAGES YET 
            # # create vector for each image in the chunk
            # if "dense_image_embeddings" in chunk:
            #     for i in range(len(chunk["dense_image_embeddings"])):
            #         vectors.append({
            #         "id": chunk["id"],
            #         "values": chunk["dense_image_embeddings"][i],
            #         "metadata": {
            #             "type": "image",
            #             "page_idx": [str(i) for i in chunk["page_idx"]], # TODO: FIX PAGE IDX SO IT MATCHES THE ORIGINAL PAGE IT WAS ON
            #             "imge_path": chunk["img_path"][i]
            #         }
            # })

    def chunks(iterable, batch_size=200):
        """A helper function to break an iterable into chunks of size batch_size."""
        it = iter(iterable)
        chunk = tuple(itertools.islice(it, batch_size))
        while chunk:
            yield chunk
            chunk = tuple(itertools.islice(it, batch_size))

    for chunk in chunks(vectors, batch_size=200):
        dense_db.upsert(
            vectors=chunk,
            namespace=namespace
        )

    print(f"Inserted {len(vectors)} vectors into the dense index.")

In [11]:
# # delete existing data
# dense_db.delete(delete_all=True, namespace="umag")

{}

In [12]:
setup_dense_db("../output/Objectifying_China/embeddings/en_contents_doc_chunked_embeddings.json")

Inserted 81 vectors into the dense index.


In [88]:
dense_db.describe_index_stats(namespace="umag")

file_path = "../output/Objectifying_China/tagged/en_contents_doc_chunked.json"

import json

def count_unique_ids_from_nested_json(filepath):
    """
    Reads a JSON file where the top-level keys are IDs and returns the number of unique IDs.
    
    Args:
        filepath (str): Path to the JSON file.
        
    Returns:
        int: Number of unique IDs.
    """
    with open(filepath, 'r') as f:
        data = json.load(f)
    
    top_level_ids = set(data.keys())
    
    # Optional: Verify if inner "id" matches the key
    mismatched = []
    for key, value in data.items():
        if value.get("id") != key:
            mismatched.append((key, value.get("id")))
    
    if mismatched:
        print(f"⚠️ Warning: {len(mismatched)} mismatched IDs between key and value:")
        for k, v in mismatched[:5]:  # show first 5 mismatches
            print(f"  Key: {k}  |  Value.id: {v}")
    
    print(f"Total top-level entries: {len(data)}")
    print(f"Unique IDs (by keys): {len(top_level_ids)}")
    return len(top_level_ids)


count_unique_ids_from_nested_json(file_path)


Total top-level entries: 82
Unique IDs (by keys): 82


82

In [67]:
query = "I'm interested in wucai/five_colour, sancai/three_colour  artifacts, from the ming period, originating in , used for artistic expression, with themes of . I also like dragons."
dense_results = dense_db.query(
        namespace="umag",
        top_k=5,
        include_values=True,
        include_metadata=True,
        vector=get_text_embeddings([query])[0].tolist()
    )

In [80]:
# dense_results

for item in dense_results.matches:
    if item['id'] == 'sdfsdf':
        print(item['id'])

In [73]:
dense_results.matches

[{'id': 'lnwpgxpl',
  'metadata': {'colour': [],
               'content': 'Ming and Qing Dynasty Ceramics and Their Stylistic '
                          'Influences Abroad',
               'exhibit': 'Objectifying China',
               'header': 'Objectifying China',
               'materiality': ['porcelain'],
               'page_idx': ['3'],
               'purpose': ['export', 'decoration'],
               'region': ['jingdezhen_kilns'],
               'themes': ['technique'],
               'time_period': ['ming', 'qing'],
               'type': 'text'},
  'score': 0.609256744,
  'values': [0.196492299,
             -0.114293158,
             -0.402429461,
             0.108975641,
             0.329515517,
             -0.349267781,
             -0.00400025398,
             0.126044333,
             0.0401671678,
             0.384126246,
             0.203765944,
             -0.177383631,
             0.165675014,
             0.221005231,
             0.385517359,
         

In [None]:

def setup_sparse_db(file_path, namespace="umag"):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    vectors = []
    for section_id, chunk in data.items():
        # create vector for text chunks
        vectors.append({
            "id": chunk["id"],
            "sparse_values": chunk["sparse_text_embedding"],
            "metadata": {
                "content": chunk["text"],
                "type": "text",
                "page_idx": [str(i) for i in chunk["page_idx"]], # convert to string for db
                "header": chunk["header"],
                "exhibit": chunk["exhibit"],
                "time_period": chunk['time_period'],
                "materiality": chunk["materiality"],
                "region": chunk["region"],
                "colour": chunk["colour"],
                "purpose": chunk["purpose"],
                "themes": chunk['themes'],
                }
            })
        
    def chunks(iterable, batch_size=200):
        """A helper function to break an iterable into chunks of size batch_size."""
        it = iter(iterable)
        chunk = tuple(itertools.islice(it, batch_size))
        while chunk:
            yield chunk
            chunk = tuple(itertools.islice(it, batch_size))


    for chunk in chunks(vectors, batch_size=200):
        sparse_db.upsert(
            vectors=chunk,
            namespace="umag"
        )

    
    print(f"Inserted {len(vectors)} vectors into the sparse index.")
    sparse_db.describe_index_stats(namespace="umag")

In [38]:
# delete existing vectors
# sparse_db.delete(delete_all=True, namespace="umag")

{}

In [22]:
import json

file_path = "output/Objectifying_China/tagged/en_contents_doc_chunked.json"

# Load the JSON data
with open(file_path, "r") as f:
    data = json.load(f)

# Add 'exhibit' field to each item
for item in data.values():
    item['exhibit'] = "Objectifying China"

# Save the updated data back to the same file
with open(file_path, "w") as f:
    json.dump(data, f, indent=4)


In [49]:
# setup_sparse_db("output/Objectifying_China/embeddings/en_contents_doc_chunked_embeddings.json")
query = "tell me about Jiajing emperor"
sparse_results = sparse_db.query(
    namespace="umag",
    top_k=5,
    include_values=False,
    include_metadata=True,
    sparse_vector=get_sparse_embeddings([query])[0]
)

In [52]:
sparse_results.matches.append(sparse_results.matches[0])  # Duplicate the first result for testing

In [55]:
final_results = []
for results in sparse_results.matches:
    result_dict = {}
    result_dict['id'] = results['id']
    result_dict['text'] = results['metadata']['content']
    final_results.append(result_dict)
final_results

[{'id': 'xazdoep6',
  'text': 'This immense ceramic is a jardinière, made as a decorative display or water container for the gardens of a Chinese villa. The main pattern depicts awkward-looking twin lions with sharp claws playing with ornamental balls. A prominent inscription at the centre reads: Da Ming Jiajing Nian Zhi— ‘Manufactured in the Reign of Jiajing of the Great Ming Dynasty’. Because of their weight, large objects like this were particularly difficult to manufacture, often warping during the firing process. By some accounts, they required up to nine days to fire, with frequent misfires. The almost purple colour of the decoration is the result of mixing imported cobalt blue with local cobalt from Ruizhou, called shiziqing. A signature colour of Jiajing porcelains, it was generally only used on jardinières reserved for the imperial court, noble families or high-ranking officials. Auspicious patterns like the twin lions sporting ornamental balls were popular during the reign of

In [45]:
result = search_with_text('vessels')
result

TypeError: search_with_text() missing 1 required positional argument: 'tags'

## Retriever Functions

In [42]:
from collections import defaultdict

def rrf_merge(sparse_results, dense_results, tag_results, k=60):
    def rank_dict(results):
        return {doc['id']: rank for rank, doc in enumerate(results)}

    # Create rank mappings
    sparse_ranks = rank_dict(sparse_results)
    dense_ranks = rank_dict(dense_results)
    tag_ranks = rank_dict(tag_results)

    # Create lookup for originals
    id_to_doc = {}
    for result_set in [sparse_results, dense_results, tag_results]:
        for doc in result_set:
            if doc['id'] not in id_to_doc:
                id_to_doc[doc['id']] = doc

    merged = {}

    # Merge all IDs
    all_ids = set(sparse_ranks) | set(dense_ranks) | set(tag_ranks)
    for id_ in all_ids:
        sparse_rank = sparse_ranks.get(id_)
        dense_rank = dense_ranks.get(id_)
        tag_rank = tag_ranks.get(id_)

        sparse_score = 1 / (k + sparse_rank) if sparse_rank is not None else 0
        dense_score = 1 / (k + dense_rank) if dense_rank is not None else 0
        tag_score = 1 / (k + tag_rank) if tag_rank is not None else 0

        final_score = sparse_score + dense_score + tag_score

        base_doc = id_to_doc[id_]
        merged[id_] = {
            'id': id_,
            'metadata': base_doc['metadata'],
            'sparse_score': sparse_score,
            'dense_score': dense_score,
            'tag_score': tag_score,
            'final_score': final_score,
        }

    return sorted(merged.values(), key=lambda x: x['final_score'], reverse=True)


In [None]:
query = "tell me about the Jiajing emperor"
tags = Preferences(
    time_period=[],
    themes=["symbolism"],
    exhibits=[],
    art_medium=["ceramics"],  # fixed typo: "cermanics" → "ceramics"
    additional_interests=["Sculpture"]
)

sparse_results = sparse_db.query(
    namespace="umag",
    top_k=5,
    include_values=False,
    include_metadata=True,
    sparse_vector=get_sparse_embeddings([query])[0]
)

dense_results = dense_db.query(
    namespace="umag",
    top_k=5,
    include_values=True,
    include_metadata=True,
    vector=get_text_embeddings([query])[0].tolist()
)

tag_results = search_tags(tags)
tag_results

In [44]:
dense_results.matches

NameError: name 'dense_results' is not defined

In [None]:
rrf_merge(dense_results.matches, sparse_results.matches, tag_results)

In [21]:
enumerate(dense_results.matches)

<enumerate at 0x170b0aca0>

In [18]:
from src_ubc.classes.survey import SurveyResponse, Preferences

response = SurveyResponse(
        major="Computer Science",
        age_group="18-24",
        class_subject="Introduction to AI",
        exhibits=["Exhibit A", "Exhibit B"],
        tour_length_minutes=60,
        time_period=["Modern Art", "Contemporary Art"],
        art_medium=["Digital", "cermanics"],  # fixed typo: "cermanics" → "ceramics"
        themes=["Innovation", "Technology"],
        additional_interests=["Interested in AI applications in art."],
        additional_notes="Looking forward to the tour!"
    )

preferences = Preferences(
        exhibits=response.exhibits,
        time_period=response.time_period,
        art_medium=response.art_medium,
        themes=response.themes,
        additional_interests=response.additional_interests
    )
 
# query = generate_human_query(preferences)

In [15]:
preferences

Preferences(time_period=['Modern Art', 'Contemporary Art'], art_medium=['Digital', 'Sculpture'], themes=['Innovation', 'Technology'], additional_interests=['Interested in AI applications in art.'])

In [19]:
tag_results = search_tags(preferences)


{'time_period': ['Modern Art', 'Contemporary Art'], 'themes': ['Innovation', 'Technology'], 'art_medium': ['Digital', 'cermanics'], 'additional_interests': ['Interested in AI applications in art.']}


In [20]:
tag_results

[{'id': 'j1m6v08t',
  'metadata': {'art_medium': ['cermanics'],
               'content': 'With large flaring mouths, vessels of this shape '
                          'resemble inverted bells and are commonly referred to '
                          'as ‘chime-type cups’. Popular throughout the Ming '
                          'and Qing dynasties, this pair was made during the '
                          'reign of the Jiajing emperor. In keeping with the '
                          "emperor's interest in auspicious symbolism, the "
                          'three goats and suns surrounded by prunus, pine and '
                          'bamboo trees are another example of the use of '
                          'homophones in the Chinese language to create visual '
                          'puns. The goats and suns (both pronounced yang) form '
                          'a rebus that refers to a passage from the I Ching '
                          '(‘The Book of Changes’): “the first 

In [26]:
response = SurveyResponse(
    major="Computer Science",
    age_group="18-24",
    class_subject="Introduction to AI",
    exhibits=["Exhibit A", "Exhibit B"],
    tour_length_minutes=60,
    time_period=["qing"],
    art_medium=["cermanics"],
    themes=[],
    additional_interests=[],
    additional_notes=["Looking forward to the tour!"]
)

preferences = Preferences(
    exhibits=response.exhibits,
    time_period=response.time_period,
    art_medium=response.art_medium,
    themes=response.themes,
    additional_interests=response.additional_interests
)
print(preferences.count_preferences())

4


In [22]:
import json

def get_number_tag_matches(tags: Preferences, hit: Dict) -> float:
    """
    Get the number of tag matches between the user's preferences and a search result hit's metadata.

    Args:
        tags: A Preferences object with attributes like time_period, themes, exhibits, etc.
        hit: A dictionary representing a search result hit with metadata.

    Returns:
        A float score representing the number of matching tags.
    """
    score = 0
    for key in tags.__dict__.keys():
        if key in hit['metadata']:
            matches = set(hit['metadata'][key]) & set(getattr(tags, key))
            score += len(matches)
    return score

def search_tags(tags: Preferences, top_k=100):
        """
        Perform a metadata search based on the provided Preferences object.
        
        Args:
            tags: A Preferences object with attributes:
                - time_period: List[str]
                - themes: List[str]
                - exhibits: List[str]
                - art_medium: List[str]
                - additional_interests: List[str]
            top_k: Number of results to return.
            
        Returns:
            List of matching documents from dense_db.
        """
           # Convert the Preferences object to a dictionary
        tag_dict = tags.__dict__

        # Remove any keys with empty lists or None values
        tag_dict = {k: v for k, v in tag_dict.items() if v}

        # Dummy vector for metadata filtering (using the dimensions expected by your dense index)
        dummy_vector = np.zeros(512).tolist()

        # Build metadata filter conditions
        filter_conditions = []
        for key, values in tag_dict.items():
            # Ensure values is a list
            if not isinstance(values, list):
                values = [values]
            
            # Each condition looks for documents where a given field contains at least one of the values
            filter_conditions.append({key: {"$in": values}})
        
        # Use $or operator so that if any condition matches, the document is returned
        metadata_filter = {"$or": filter_conditions} if filter_conditions else {}
        
        print(f"Metadata filter: {metadata_filter}")
        
        response = dense_db.query(
            namespace="umag",
            # vector=dummy_vector,
            id='jvgpbga6', 
            top_k=top_k,
            include_metadata=True,
            # filter=metadata_filter
        )
        
        # if len(response.matches) == 0:
        #     print("No matches found for the given tags.")
        #     return []
        

        for i in range(len(response.matches)):
            # normalize the score based on the number of preferences
            response.matches[i]['score'] = get_number_tag_matches(tags, response.matches[i]) / tags.count_preferences()

        # sort matches by tag score
        response.matches.sort(key=lambda match: match['score'], reverse=True)
        return response.matches

In [14]:
from classes.survey import SurveyResponse, Preferences
from query_rewritter import generate_human_query
from retriever import DefaultRetriever


response9 = SurveyResponse(
    major="Art History",
    age_group="18-22",
    class_subject="Art History",
    exhibits=["objectifying_china"],
    tour_length_minutes=100,
    time_period=["ming"],  # Valid time period
    materiality=[""],  # Valid material
    themes=[""],  # Valid theme
    additional_interests=["dragons"],
    region=[""],  # Valid region
    colour=["wucai/five_colour"],  # Valid colour
    purpose=["artistic expression"],  # Valid purpose
    additional_notes=["analysis of the aesthetic qualities in ceramics"]
)

preferences9 = Preferences(
    exhibits=response9.exhibits,
    time_period=response9.time_period,
    materiality=response9.materiality,
    region=response9.region,
    colour=response9.colour,
    purpose=response9.purpose,
    themes=response9.themes,
    additional_interests=response9.additional_interests
)

In [24]:
retriever = DefaultRetriever()

sample_results = search_tags(preferences9, top_k=200)
sample_results

# for item in sample_results:
#     if item['id'] == 'jvgpbga6':
#         print("tag_results:", item['jvgpbga6'])

Metadata filter: {'$or': [{'exhibits': {'$in': ['objectifying_china']}}, {'time_period': {'$in': ['ming']}}, {'materiality': {'$in': ['']}}, {'region': {'$in': ['']}}, {'colour': {'$in': ['wucai/five_colour']}}, {'purpose': {'$in': ['artistic expression']}}, {'themes': {'$in': ['']}}, {'additional_interests': {'$in': ['dragons']}}]}


[{'id': 'jvgpbga6',
  'metadata': {'colour': ['wucai/five_colour', 'blue/cobalt', 'yellow', 'green'],
               'content': 'These ceramics are decorated in the wucai '
                          "('five-colour') palette, with outlines in underglaze "
                          'blue and surface enamels in red, yellow and green. '
                          'First developed during the reign of the Jiajing '
                          'emperor, the production of wucai porcelain '
                          'flourished in the Wanli period. These emperors '
                          'commissioned a variety of porcelain objects '
                          'decorated in this style, made in both the imperial '
                          'kilns and private commercial enterprises. These '
                          'plates, decorated on the interior with an imperial '
                          'five-clawed dragon and marked on the base with the '
                          'reign marks of the Wanl

In [42]:
print(sample_results)

[{'id': 'j1m6v08t',
 'metadata': {'colour': ['wucai/five_colour'],
              'content': 'With large flaring mouths, vessels of this shape '
                         'resemble inverted bells and are commonly referred to '
                         'as ‘chime-type cups’. Popular throughout the Ming '
                         'and Qing dynasties, this pair was made during the '
                         'reign of the Jiajing emperor. In keeping with the '
                         "emperor's interest in auspicious symbolism, the "
                         'three goats and suns surrounded by prunus, pine and '
                         'bamboo trees are another example of the use of '
                         'homophones in the Chinese language to create visual '
                         'puns. The goats and suns (both pronounced yang) form '
                         'a rebus that refers to a passage from the I Ching '
                         '(‘The Book of Changes’): “the first month of 

In [16]:
response = dense_db.query(
            namespace="umag",
            id='jvgpbga6', 
            top_k=2,
            include_metadata=True,
            # filter=metadata_filter
        )
response

{'matches': [{'id': 'jvgpbga6',
              'metadata': {'colour': ['wucai/five_colour',
                                      'blue/cobalt',
                                      'yellow',
                                      'green'],
                           'content': 'These ceramics are decorated in the '
                                      "wucai ('five-colour') palette, with "
                                      'outlines in underglaze blue and surface '
                                      'enamels in red, yellow and green. First '
                                      'developed during the reign of the '
                                      'Jiajing emperor, the production of '
                                      'wucai porcelain flourished in the Wanli '
                                      'period. These emperors commissioned a '
                                      'variety of porcelain objects decorated '
                                      'in this style,