# Import & Paths


In [18]:
import pandas as pd
import json
import os
import openai
import ast

from typing import Union, Dict, Optional

In [33]:
MAIN_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(MAIN_DIR, "data")
DATABASE_DIR = os.path.join(DATA_DIR, "db")

with open(os.path.join(MAIN_DIR, "auth", "api_keys.json"), "r") as f:
    api_keys = json.load(f)
    
os.environ["OPENAI_API_KEY"] = api_keys["OPENAI_API_KEY"]
openai.api_key = api_keys["OPENAI_API_KEY"]

In [26]:
def format_metadata(
    metadata: Union[str, Dict], prefix: Optional[str] = "Image features:", suffix: Optional[str] = None, separator: str = "\n- "
) -> str:
    if isinstance(metadata, str):
        metadata = ast.literal_eval(metadata)
    assert isinstance(metadata, Dict), "Metadata must be a dictionary"
    
    metadata_content = separator.join([f"{k}: {v}" for k, v in metadata.items()])
    if prefix:
        metadata_content = prefix + separator + metadata_content
    if suffix:
        metadata_content = metadata_content + separator + suffix
        
    return metadata_content

# Text embeddings generation

In [30]:
metadata_df = pd.read_csv(os.path.join(DATA_DIR, "exp_metadata.csv"))
formatted_metadatas = [format_metadata(metadata) for metadata in metadata_df["metadata"]]
formatted_metadatas

['Image features:\n- shape: CAPSULE\n- color: WHITE\n- imprint: Lilly;3227;10;mg\n- imprintColor: BLACK\n- imprintType: PRINTED',
 "Image features:\n- shape: CAPSULE\n- color: ['WHITE', 'BLUE']\n- imprint: Lilly;3228;25;mg\n- imprintColor: BLACK\n- imprintType: PRINTED",
 'Image features:\n- shape: CAPSULE\n- color: BLUE\n- imprint: Lilly;3229;40;mg\n- imprintColor: BLACK\n- imprintType: PRINTED',
 'Image features:\n- shape: CAPSULE\n- color: GREEN\n- imprint: Lilly;3235;20;mg\n- imprintColor: BLACK\n- imprintType: PRINTED',
 "Image features:\n- shape: CAPSULE\n- color: ['WHITE', 'YELLOW']\n- imprint: Lilly;3238;18;mg\n- imprintColor: BLACK\n- imprintType: PRINTED",
 "Image features:\n- shape: CAPSULE\n- color: ['BLUE', 'YELLOW']\n- imprint: Lilly;3239;60;mg\n- imprintColor: BLACK\n- imprintType: PRINTED",
 "Image features:\n- shape: CAPSULE\n- color: ['BLUE', 'WHITE']\n- imprint: Lilly;3240;30;mg\n- imprintColor: BROWN\n- imprintType: PRINTED",
 "Image features:\n- shape: CAPSULE\n- c

In [31]:
from llama_index.embeddings import OpenAIEmbedding
import numpy as np

embeddings = OpenAIEmbedding()
text_embs = embeddings.get_text_embedding_batch(formatted_metadatas, show_progress=True)

# np.save(os.path.join(DATABASE_DIR, "text-embeddings-ada-2_metadata.npy"), text_embs)

  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|██████████| 2000/2000 [02:21<00:00, 14.18it/s]


# Image Embeddings generation

# Hybrid Database Creation

In [None]:
from llama_index.indices.base_retriever import BaseRetriever

class MultiModalRetriever(BaseRetriever):
    